[gemma-4-multimodal] Vision-Based State Verification for Morrowind Agent #1487
285
scripts/visual_state_verifier.py
Normal file
285
scripts/visual_state_verifier.py
Normal file
@@ -0,0 +1,285 @@
|
|||||||
|
"""
|
||||||
|
Visual State Verification Module for Game Agents
|
||||||
|
=================================================
|
||||||
|
|
||||||
|
Provides screenshot-based environmental state verification for game agents
|
||||||
|
(Morrowind, Minecraft, or any game with a screenshot API). Uses multimodal
|
||||||
|
analysis to confirm agent expectations match actual game state.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from scripts.visual_state_verifier import VisualStateVerifier
|
||||||
|
|
||||||
|
verifier = VisualStateVerifier()
|
||||||
|
result = verifier.verify_state(
|
||||||
|
screenshot_path="/tmp/game_screenshot.png",
|
||||||
|
expected_state={"location": "Balmora", "health_above": 50, "has_weapon": True},
|
||||||
|
context="Player should be in Balmora with a weapon equipped"
|
||||||
|
)
|
||||||
|
print(result.verified) # True/False
|
||||||
|
print(result.details) # Human-readable analysis
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
import os
import re
import subprocess
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
class VerificationStatus(Enum):
    """Possible outcomes of one visual state verification."""

    # The vision verdict reported that the expected conditions hold.
    VERIFIED = "verified"
    # The vision verdict reported that the expected conditions do not hold.
    FAILED = "failed"
    # No usable verdict is available (analysis still pending or unparseable).
    UNCERTAIN = "uncertain"
    # Verification could not be attempted (e.g. the screenshot is missing).
    ERROR = "error"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class VerificationResult:
    """Outcome of checking one screenshot against an expected game state."""

    # Overall outcome category.
    status: VerificationStatus
    # Convenience flag for callers that only need a pass/fail answer.
    verified: bool
    # Confidence reported for the verdict; intended range is 0.0 - 1.0.
    confidence: float
    # Human-readable summary or explanation.
    details: str
    # The expected-state dict the caller asked to verify.
    expected: dict
    # Observed state, when available; defaults to a fresh empty dict.
    observed: dict = field(default_factory=dict)
    # Conditions reported as not met; defaults to a fresh empty list.
    mismatches: list = field(default_factory=list)
    # Path of the screenshot this result refers to, if any.
    screenshot_path: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
class VisualStateVerifier:
    """
    Verify game state by checking screenshots against expected conditions.

    Supports any game that can produce screenshots. Designed for integration
    with MCP screenshot tools and vision analysis capabilities.

    This class does not call a vision model itself: ``_analyze_screenshot``
    returns a JSON "request" payload for the calling agent, and
    ``_parse_analysis`` converts either that payload (pending result) or a
    model's textual answer into a ``VerificationResult``.
    """

    # Fenced ```json {...} ``` verdict block that the prompt asks the model
    # to emit. DOTALL lets the object span multiple lines.
    _JSON_BLOCK_RE = re.compile(r"```json\s*(\{.*?\})\s*```", re.DOTALL)

    def __init__(self, vision_backend: str = "builtin"):
        """
        Args:
            vision_backend: "builtin" for MCP vision, "ollama" for local model.
                The value is stored on the instance; no method of this class
                currently branches on it.
        """
        self.vision_backend = vision_backend

    def verify_state(
        self,
        screenshot_path: str,
        expected_state: dict,
        context: str = "",
        game: str = "generic"
    ) -> VerificationResult:
        """
        Verify a game screenshot matches expected state conditions.

        Args:
            screenshot_path: Path to the screenshot file
            expected_state: Dict of expected conditions, e.g.:
                {
                    "location": "Balmora",
                    "health_above": 50,
                    "has_weapon": True,
                    "time_of_day": "day",
                    "nearby_npcs": ["Caius Cosades"]
                }
            context: Additional context for the vision model
            game: Game name for context ("morrowind", "minecraft", "generic")

        Returns:
            VerificationResult with status, confidence, and details. When the
            screenshot is not an existing regular file the result has status
            ERROR and no analysis is attempted.
        """
        # is_file() rather than exists(): a directory is not a screenshot.
        if not Path(screenshot_path).is_file():
            return VerificationResult(
                status=VerificationStatus.ERROR,
                verified=False,
                confidence=0.0,
                details=f"Screenshot not found: {screenshot_path}",
                expected=expected_state,
                screenshot_path=screenshot_path
            )

        prompt = self._build_prompt(expected_state, context, game)
        analysis = self._analyze_screenshot(screenshot_path, prompt)
        return self._parse_analysis(analysis, expected_state, screenshot_path)

    def _build_prompt(self, expected: dict, context: str, game: str) -> str:
        """
        Build a structured verification prompt for the vision model.

        Each expected condition becomes one bullet line; the prompt ends by
        asking for a fenced JSON verdict, which ``_parse_analysis`` reads.
        """
        conditions = []
        for key, value in expected.items():
            # bool must be tested before int/float: bool is an int subclass.
            if isinstance(value, bool):
                conditions.append(f"- {key}: {'yes' if value else 'no'}")
            elif isinstance(value, (int, float)):
                conditions.append(f"- {key}: {value} or better")
            elif isinstance(value, list):
                listed = ", ".join(str(v) for v in value)
                conditions.append(f"- {key}: should include {listed}")
            else:
                conditions.append(f"- {key}: {value}")

        context_text = context if context else "No additional context provided."
        prompt_lines = [
            f"Analyze this {game} game screenshot and verify the following conditions:",
            "",
            "\n".join(conditions),
            "",
            f"Context: {context_text}",
            "",
            "For each condition, state VERIFIED, FAILED, or UNCERTAIN with a brief reason.",
            "End with a JSON block:",
            "```json",
            "{",
            '    "verified": true/false,',
            '    "confidence": 0.0-1.0,',
            '    "details": "brief summary",',
            '    "mismatches": ["list of failed conditions"]',
            "}",
            "```",
            "",  # keeps the trailing newline at the end of the prompt
        ]
        return "\n".join(prompt_lines)

    def _analyze_screenshot(self, path: str, prompt: str) -> str:
        """
        Package the analysis request for the vision backend.

        In a live agent context, this would call the MCP vision tool.
        For standalone use, it returns the prompt for manual invocation.

        Returns:
            A JSON string with "prompt", "screenshot_path" and "instruction"
            keys. The "instruction" key is what ``_parse_analysis`` uses to
            recognise a request that has not been analyzed yet.
        """
        return json.dumps({
            "prompt": prompt,
            "screenshot_path": str(path),
            "instruction": "Use vision_analyze tool with this prompt and screenshot_path"
        })

    def _parse_analysis(
        self, analysis: str, expected: dict, screenshot_path: str
    ) -> VerificationResult:
        """
        Parse vision analysis into a VerificationResult.

        Recognised inputs, in order:
          1. The JSON request payload from ``_analyze_screenshot`` (has an
             "instruction" key): reported as UNCERTAIN / pending.
          2. A bare JSON object containing a "verified" key: treated as the
             verdict.
          3. Free text containing fenced ```json blocks: the last block that
             parses to an object is the verdict, because the prompt asks the
             model to *end* with it.
          4. Anything else: UNCERTAIN with the first 500 characters of the
             text as details.
        """
        try:
            data = json.loads(analysis)
        except json.JSONDecodeError:
            data = None

        # Valid JSON need not be an object (e.g. "42"); only dicts are probed.
        if isinstance(data, dict):
            if "instruction" in data:
                pending_path = data.get("screenshot_path", screenshot_path)
                prompt_preview = str(data.get("prompt", ""))[:100]
                return VerificationResult(
                    status=VerificationStatus.UNCERTAIN,
                    verified=False,
                    confidence=0.0,
                    # Single-quoted f-string so the embedded double quotes are
                    # legal on every Python version.
                    details=(
                        "Pending analysis. Run: "
                        f'vision_analyze("{pending_path}", "{prompt_preview}...")'
                    ),
                    expected=expected,
                    screenshot_path=screenshot_path
                )
            if "verified" in data:
                return self._result_from_verdict(data, expected, screenshot_path)

        for candidate in reversed(self._JSON_BLOCK_RE.findall(analysis)):
            try:
                verdict = json.loads(candidate)
            except json.JSONDecodeError:
                continue  # malformed block: try an earlier one
            if isinstance(verdict, dict):
                return self._result_from_verdict(verdict, expected, screenshot_path)

        # Fallback: no machine-readable verdict was found.
        return VerificationResult(
            status=VerificationStatus.UNCERTAIN,
            verified=False,
            confidence=0.3,
            details=analysis[:500],
            expected=expected,
            screenshot_path=screenshot_path
        )

    def _result_from_verdict(
        self, verdict: dict, expected: dict, screenshot_path: str
    ) -> VerificationResult:
        """Convert a parsed verdict object into a normalised VerificationResult."""
        # Only a JSON ``true`` counts; a truthy string such as "false" must
        # never be reported as verified.
        verified = verdict.get("verified") is True

        raw_mismatches = verdict.get("mismatches") or []
        if isinstance(raw_mismatches, (list, tuple)):
            mismatches = list(raw_mismatches)
        else:
            mismatches = [raw_mismatches]

        return VerificationResult(
            status=VerificationStatus.VERIFIED if verified else VerificationStatus.FAILED,
            verified=verified,
            confidence=self._coerce_confidence(verdict.get("confidence", 0.0)),
            details=str(verdict.get("details") or ""),
            expected=expected,
            mismatches=mismatches,
            screenshot_path=screenshot_path
        )

    @staticmethod
    def _coerce_confidence(value) -> float:
        """Return ``value`` as a float clamped to [0.0, 1.0]; 0.0 if unusable."""
        try:
            confidence = float(value)
        except (TypeError, ValueError):
            return 0.0
        if confidence != confidence:  # NaN is the only value unequal to itself
            return 0.0
        return min(1.0, max(0.0, confidence))

    @staticmethod
    def morrowind_state(
        location: Optional[str] = None,
        health_min: Optional[int] = None,
        has_weapon: Optional[bool] = None,
        is_indoors: Optional[bool] = None,
        time_of_day: Optional[str] = None,
        nearby_npcs: Optional[list] = None,
        **extra
    ) -> dict:
        """
        Build expected state dict for Morrowind.

        Only supplied arguments produce keys. ``location``, ``time_of_day``
        and ``nearby_npcs`` are skipped when falsy (None, "" or an empty
        list); ``health_min``, ``has_weapon`` and ``is_indoors`` are skipped
        only when None, so 0 and False are kept. ``extra`` keyword arguments
        are merged last and therefore override the named keys on collision.

        Returns:
            Dict using the keys "location", "health_above", "has_weapon",
            "indoors", "time_of_day" and "nearby_npcs", plus any extras.
        """
        state = {}
        if location:
            state["location"] = location
        if health_min is not None:
            state["health_above"] = health_min
        if has_weapon is not None:
            state["has_weapon"] = has_weapon
        if is_indoors is not None:
            state["indoors"] = is_indoors
        if time_of_day:
            state["time_of_day"] = time_of_day
        if nearby_npcs:
            state["nearby_npcs"] = nearby_npcs
        state.update(extra)
        return state
|
||||||
|
|
||||||
|
|
||||||
|
# --- Example Verification Flows ---
# The constants below hold illustrative snippets as plain text; they are not
# executed by the code shown in this module.

EXAMPLE_MORROWIND_VERIFICATION = """
# Verify player is in Balmora with a weapon
verifier = VisualStateVerifier()
result = verifier.verify_state(
    screenshot_path="/tmp/morrowind_screenshot.png",
    expected_state=VisualStateVerifier.morrowind_state(
        location="Balmora",
        health_min=50,
        has_weapon=True
    ),
    context="After completing the first Caius Cosades quest",
    game="morrowind"
)

if result.verified:
    print(f"State confirmed: {result.details}")
else:
    print(f"State mismatch: {result.mismatches}")
"""

# The dict keys must match verify_state()'s parameter names, because each
# entry is unpacked straight into the call with ``**state``.
EXAMPLE_BATCH_VERIFICATION = """
# Verify multiple game states in sequence
states = [
    {"screenshot_path": "screen1.png", "expected_state": {"location": "Seyda Neen"}, "context": "After character creation"},
    {"screenshot_path": "screen2.png", "expected_state": {"location": "Balmora", "has_weapon": True}, "context": "After buying weapon"},
    {"screenshot_path": "screen3.png", "expected_state": {"health_above": 80}, "context": "After resting"},
]

verifier = VisualStateVerifier()
for state in states:
    result = verifier.verify_state(**state, game="morrowind")
    print(f"{state['context']}: {'PASS' if result.verified else 'FAIL'} (confidence: {result.confidence:.0%})")
"""
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Demo: run one verification against a sample path and print the
    # resulting details message.
    verifier = VisualStateVerifier()

    demo_conditions = dict(
        location="Balmora",
        health_min=50,
        has_weapon=True,
        nearby_npcs=["Caius Cosades"],
    )
    expected = verifier.morrowind_state(**demo_conditions)

    result = verifier.verify_state(
        "/tmp/demo_screenshot.png",
        expected,
        context="Player should have completed the first quest",
        game="morrowind",
    )
    print(result.details)
|
||||||
113
tests/test_visual_state_verifier.py
Normal file
113
tests/test_visual_state_verifier.py
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
"""Tests for visual state verification module."""
|
||||||
|
import json
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
# Add parent to path for import
|
||||||
|
import sys
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from scripts.visual_state_verifier import (
|
||||||
|
VisualStateVerifier,
|
||||||
|
VerificationResult,
|
||||||
|
VerificationStatus,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestVisualStateVerifier:
    """Unit tests covering VisualStateVerifier and its helper methods."""

    def test_missing_screenshot_returns_error(self):
        # A path that does not exist must short-circuit to an ERROR result.
        outcome = VisualStateVerifier().verify_state(
            screenshot_path="/nonexistent/screenshot.png",
            expected_state={"location": "Balmora"},
            game="morrowind"
        )
        assert outcome.status == VerificationStatus.ERROR
        assert not outcome.verified
        assert "not found" in outcome.details.lower()

    def test_morrowind_state_builder(self):
        # Named arguments map onto the documented state keys.
        built = VisualStateVerifier.morrowind_state(
            location="Balmora",
            health_min=50,
            has_weapon=True,
            nearby_npcs=["Caius Cosades"]
        )
        assert built["location"] == "Balmora"
        assert built["health_above"] == 50
        assert built["has_weapon"] is True
        assert built["nearby_npcs"] == ["Caius Cosades"]

    def test_morrowind_state_minimal(self):
        # Arguments left unset must not appear in the result.
        built = VisualStateVerifier.morrowind_state(location="Vivec")
        assert built == {"location": "Vivec"}

    def test_morrowind_state_with_extras(self):
        # Unknown keyword arguments are passed through unchanged.
        built = VisualStateVerifier.morrowind_state(
            location="Balmora",
            quest_complete=True,
            gold_min=1000
        )
        assert built["quest_complete"] is True
        assert built["gold_min"] == 1000

    def test_prompt_includes_conditions(self):
        # Conditions, context and game name must all reach the prompt text.
        conditions = {"location": "Balmora", "health_above": 50}
        text = VisualStateVerifier()._build_prompt(conditions, "Test context", "morrowind")
        assert "Balmora" in text
        assert "50" in text
        assert "Test context" in text
        assert "morrowind" in text

    def test_parse_analysis_returns_pending_for_raw(self):
        # An un-analyzed request payload is reported as pending/uncertain.
        request_payload = json.dumps({
            "prompt": "test",
            "screenshot_path": "/tmp/test.png",
            "instruction": "Use vision_analyze"
        })
        outcome = VisualStateVerifier()._parse_analysis(request_payload, {}, "/tmp/test.png")
        assert outcome.status == VerificationStatus.UNCERTAIN
        assert not outcome.verified

    def test_parse_analysis_extracts_json(self):
        # A positive verdict inside a fenced JSON block is extracted.
        model_reply = """
        The player appears to be in Balmora.
        Health looks good.

        ```json
        {
            "verified": true,
            "confidence": 0.85,
            "details": "Player is in Balmora with weapon equipped",
            "mismatches": []
        }
        ```
        """
        outcome = VisualStateVerifier()._parse_analysis(
            model_reply, {"location": "Balmora"}, "/tmp/test.png"
        )
        assert outcome.status == VerificationStatus.VERIFIED
        assert outcome.verified
        assert outcome.confidence == 0.85
        assert outcome.mismatches == []

    def test_parse_analysis_handles_failures(self):
        # A negative verdict yields FAILED and carries the mismatches along.
        model_reply = """
        ```json
        {
            "verified": false,
            "confidence": 0.9,
            "details": "Player is not in Balmora",
            "mismatches": ["location"]
        }
        ```
        """
        outcome = VisualStateVerifier()._parse_analysis(
            model_reply, {"location": "Balmora"}, "/tmp/test.png"
        )
        assert outcome.status == VerificationStatus.FAILED
        assert not outcome.verified
        assert "location" in outcome.mismatches
|
||||||
Reference in New Issue
Block a user