[gemma-4-multimodal] Vision-Based State Verification for Morrowind Agent #1487

Open
Rockachopa wants to merge 2 commits from gemma4-worker-20260409-105205-1482 into main
2 changed files with 398 additions and 0 deletions

View File

@@ -0,0 +1,285 @@
"""
Visual State Verification Module for Game Agents
=================================================

Provides screenshot-based environmental state verification for game agents
(Morrowind, Minecraft, or any game with a screenshot API). Uses multimodal
analysis to confirm agent expectations match actual game state.

Usage:
    from scripts.visual_state_verifier import VisualStateVerifier

    verifier = VisualStateVerifier()
    result = verifier.verify_state(
        screenshot_path="/tmp/game_screenshot.png",
        expected_state={"location": "Balmora", "health_above": 50, "has_weapon": True},
        context="Player should be in Balmora with a weapon equipped"
    )
    print(result.verified)  # True/False
    print(result.details)   # Human-readable analysis

Note: in standalone use no vision model is called; verify_state() returns an
UNCERTAIN "pending" result whose details describe the vision_analyze call the
surrounding agent is expected to make.
"""
import json
import os
import subprocess
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Optional
class VerificationStatus(Enum):
    """Status of a visual state verification."""

    # The analysis reported that the expected conditions hold.
    VERIFIED = "verified"
    # The analysis reported that the expected conditions do not hold.
    FAILED = "failed"
    # No usable verdict is available (analysis pending or unparseable).
    UNCERTAIN = "uncertain"
    # Verification could not be attempted (e.g. screenshot file missing).
    ERROR = "error"
@dataclass
class VerificationResult:
    """Result of a visual state verification.

    The field order defines the positional constructor signature, so it
    must stay as declared here.
    """

    # Overall outcome category.
    status: VerificationStatus
    # True only when the expected state was confirmed.
    verified: bool
    # Confidence in the verdict, 0.0 - 1.0.
    confidence: float
    # Human-readable summary (or an error / pending message).
    details: str
    # The expected-state dict that was checked.
    expected: dict
    # Observed state; each instance gets its own empty dict by default.
    observed: dict = field(default_factory=dict)
    # Conditions reported as failed; each instance gets its own empty list.
    mismatches: list = field(default_factory=list)
    # Path of the screenshot this result refers to, if any.
    screenshot_path: Optional[str] = None
class VisualStateVerifier:
    """
    Verifies game state by analyzing screenshots against expected conditions.

    Supports any game that can produce screenshots. Designed for integration
    with MCP screenshot tools and vision analysis capabilities.
    """

    def __init__(self, vision_backend: str = "builtin"):
        """
        Args:
            vision_backend: "builtin" for MCP vision, "ollama" for local model.
                The value is stored on the instance; none of the methods in
                this class currently branch on it.
        """
        self.vision_backend = vision_backend

    def verify_state(
        self,
        screenshot_path: str,
        expected_state: dict,
        context: str = "",
        game: str = "generic"
    ) -> VerificationResult:
        """
        Verify a game screenshot matches expected state conditions.

        Args:
            screenshot_path: Path to the screenshot file
            expected_state: Dict of expected conditions, e.g.:
                {
                    "location": "Balmora",
                    "health_above": 50,
                    "has_weapon": True,
                    "time_of_day": "day",
                    "nearby_npcs": ["Caius Cosades"]
                }
            context: Additional context for the vision model
            game: Game name for context ("morrowind", "minecraft", "generic")

        Returns:
            VerificationResult with status, confidence, and details. A
            missing screenshot yields an ERROR result rather than raising.
        """
        if not Path(screenshot_path).exists():
            return VerificationResult(
                status=VerificationStatus.ERROR,
                verified=False,
                confidence=0.0,
                details=f"Screenshot not found: {screenshot_path}",
                expected=expected_state,
                screenshot_path=screenshot_path
            )

        # Build prompt -> hand to the vision backend -> interpret the reply.
        prompt = self._build_prompt(expected_state, context, game)
        analysis = self._analyze_screenshot(screenshot_path, prompt)
        return self._parse_analysis(analysis, expected_state, screenshot_path)

    def _build_prompt(self, expected: dict, context: str, game: str) -> str:
        """Build a structured verification prompt for the vision model.

        Each expected condition becomes one "- key: ..." bullet; the prompt
        ends by asking for a fenced JSON verdict that _parse_analysis reads.
        """
        conditions = []
        for key, value in expected.items():
            # bool is tested first because bool is a subclass of int.
            if isinstance(value, bool):
                conditions.append(f"- {key}: {'yes' if value else 'no'}")
            elif isinstance(value, (int, float)):
                conditions.append(f"- {key}: {value} or better")
            elif isinstance(value, list):
                conditions.append(f"- {key}: should include {', '.join(str(v) for v in value)}")
            else:
                conditions.append(f"- {key}: {value}")

        condition_block = "\n".join(conditions)
        context_text = context if context else "No additional context provided."
        prompt = f"""Analyze this {game} game screenshot and verify the following conditions:
{condition_block}
Context: {context_text}
For each condition, state VERIFIED, FAILED, or UNCERTAIN with a brief reason.
End with a JSON block:
```json
{{
"verified": true/false,
"confidence": 0.0-1.0,
"details": "brief summary",
"mismatches": ["list of failed conditions"]
}}
```
"""
        return prompt

    def _analyze_screenshot(self, path: str, prompt: str) -> str:
        """
        Send screenshot to vision backend for analysis.

        In a live agent context, this would call the MCP vision tool.
        For standalone use, it returns the prompt for manual invocation:
        a JSON envelope with "prompt", "screenshot_path" and "instruction".
        """
        # Return structured prompt for the calling agent to process
        return json.dumps({
            "prompt": prompt,
            "screenshot_path": str(path),
            "instruction": "Use vision_analyze tool with this prompt and screenshot_path"
        })

    @staticmethod
    def _coerce_confidence(value) -> float:
        """Convert a model-supplied confidence into a float within [0.0, 1.0].

        Non-numeric values and NaN become 0.0; out-of-range values are clamped.
        """
        try:
            score = float(value)
        except (TypeError, ValueError):
            return 0.0
        if score != score:  # NaN is the only value unequal to itself
            return 0.0
        return min(1.0, max(0.0, score))

    def _parse_analysis(
        self, analysis: str, expected: dict, screenshot_path: str
    ) -> VerificationResult:
        """Parse vision analysis into a VerificationResult.

        Three cases, in order:
        1. ``analysis`` is the pending envelope produced by
           _analyze_screenshot (a JSON object with an "instruction" key):
           return UNCERTAIN with a "Pending analysis" message.
        2. ``analysis`` contains a fenced ```json block: use its verdict.
        3. Anything else: return UNCERTAIN carrying the first 500 characters
           of the raw text.
        """
        import re

        try:
            data = json.loads(analysis)
        except json.JSONDecodeError:
            data = None

        # json.loads may return any JSON value (number, list, ...); only an
        # object can be the pending envelope.
        if isinstance(data, dict) and "instruction" in data:
            pending_path = data.get("screenshot_path", screenshot_path)
            prompt_preview = str(data.get("prompt", ""))[:100]
            return VerificationResult(
                status=VerificationStatus.UNCERTAIN,
                verified=False,
                confidence=0.0,
                # Single-quoted f-string so the literal double quotes around
                # the arguments do not terminate the string.
                details=f'Pending analysis. Run: vision_analyze("{pending_path}", "{prompt_preview}...")',
                expected=expected,
                screenshot_path=screenshot_path
            )

        # Parse text analysis for JSON block
        json_match = re.search(r"```json\s*({.*?})\s*```", analysis, re.DOTALL)
        if json_match:
            try:
                result = json.loads(json_match.group(1))
            except json.JSONDecodeError:
                result = None
            if isinstance(result, dict):
                # Only a real JSON `true` counts as verified, so a stray
                # string such as "false" can never be read as success.
                is_verified = result.get("verified") is True
                raw_mismatches = result.get("mismatches") or []
                if isinstance(raw_mismatches, (list, tuple)):
                    mismatches = list(raw_mismatches)
                else:
                    mismatches = [str(raw_mismatches)]
                return VerificationResult(
                    status=(
                        VerificationStatus.VERIFIED
                        if is_verified
                        else VerificationStatus.FAILED
                    ),
                    verified=is_verified,
                    confidence=self._coerce_confidence(result.get("confidence", 0.0)),
                    details=str(result.get("details") or ""),
                    expected=expected,
                    mismatches=mismatches,
                    screenshot_path=screenshot_path
                )

        # Fallback: return as uncertain
        return VerificationResult(
            status=VerificationStatus.UNCERTAIN,
            verified=False,
            confidence=0.3,
            details=analysis[:500],
            expected=expected,
            screenshot_path=screenshot_path
        )

    @staticmethod
    def morrowind_state(
        location: Optional[str] = None,
        health_min: Optional[int] = None,
        has_weapon: Optional[bool] = None,
        is_indoors: Optional[bool] = None,
        time_of_day: Optional[str] = None,
        nearby_npcs: Optional[list] = None,
        **extra
    ) -> dict:
        """Build expected state dict for Morrowind.

        Only supplied arguments produce keys. ``health_min`` is stored as
        "health_above" and ``is_indoors`` as "indoors"; any extra keyword
        arguments are merged in last and may override the named keys.
        """
        state = {}
        if location:
            state["location"] = location
        # `is not None` keeps legitimate falsy values such as 0 and False.
        if health_min is not None:
            state["health_above"] = health_min
        if has_weapon is not None:
            state["has_weapon"] = has_weapon
        if is_indoors is not None:
            state["indoors"] = is_indoors
        if time_of_day:
            state["time_of_day"] = time_of_day
        if nearby_npcs:
            state["nearby_npcs"] = nearby_npcs
        state.update(extra)
        return state
# --- Example Verification Flows ---
# Illustrative snippet kept as a plain string; nothing in the visible module
# executes it.
EXAMPLE_MORROWIND_VERIFICATION = """
# Verify player is in Balmora with a weapon
verifier = VisualStateVerifier()
result = verifier.verify_state(
    screenshot_path="/tmp/morrowind_screenshot.png",
    expected_state=VisualStateVerifier.morrowind_state(
        location="Balmora",
        health_min=50,
        has_weapon=True
    ),
    context="After completing the first Caius Cosades quest",
    game="morrowind"
)
if result.verified:
    print(f"State confirmed: {result.details}")
else:
    print(f"State mismatch: {result.mismatches}")
"""
# Illustrative snippet kept as a plain string. The dict keys match the
# verify_state() parameter names because each dict is unpacked with **state.
EXAMPLE_BATCH_VERIFICATION = """
# Verify multiple game states in sequence
states = [
    {"screenshot_path": "screen1.png", "expected_state": {"location": "Seyda Neen"}, "context": "After character creation"},
    {"screenshot_path": "screen2.png", "expected_state": {"location": "Balmora", "has_weapon": True}, "context": "After buying weapon"},
    {"screenshot_path": "screen3.png", "expected_state": {"health_above": 80}, "context": "After resting"},
]
verifier = VisualStateVerifier()
for state in states:
    result = verifier.verify_state(**state, game="morrowind")
    print(f"{state['context']}: {'PASS' if result.verified else 'FAIL'} (confidence: {result.confidence:.0%})")
"""
if __name__ == "__main__":
    # Demo: run one verification and print its human-readable details.
    # If /tmp/demo_screenshot.png does not exist this prints the
    # "Screenshot not found" message; otherwise it prints the pending
    # vision_analyze instruction (no vision model is called here).
    verifier = VisualStateVerifier()
    expected = verifier.morrowind_state(
        location="Balmora",
        health_min=50,
        has_weapon=True,
        nearby_npcs=["Caius Cosades"]
    )
    result = verifier.verify_state(
        screenshot_path="/tmp/demo_screenshot.png",
        expected_state=expected,
        context="Player should have completed the first quest",
        game="morrowind"
    )
    print(result.details)

View File

@@ -0,0 +1,113 @@
"""Tests for visual state verification module."""
import json
import tempfile
from pathlib import Path
import pytest
# Add parent to path for import
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))
from scripts.visual_state_verifier import (
VisualStateVerifier,
VerificationResult,
VerificationStatus,
)
class TestVisualStateVerifier:
    """Test the visual state verifier."""

    def test_missing_screenshot_returns_error(self):
        """A nonexistent screenshot path yields an ERROR result, not an exception."""
        verifier = VisualStateVerifier()
        result = verifier.verify_state(
            screenshot_path="/nonexistent/screenshot.png",
            expected_state={"location": "Balmora"},
            game="morrowind"
        )
        assert result.status == VerificationStatus.ERROR
        assert not result.verified
        assert "not found" in result.details.lower()

    def test_morrowind_state_builder(self):
        """Named arguments map onto the expected-state keys."""
        state = VisualStateVerifier.morrowind_state(
            location="Balmora",
            health_min=50,
            has_weapon=True,
            nearby_npcs=["Caius Cosades"]
        )
        assert state["location"] == "Balmora"
        assert state["health_above"] == 50
        assert state["has_weapon"] is True
        assert state["nearby_npcs"] == ["Caius Cosades"]

    def test_morrowind_state_minimal(self):
        """Arguments that are not supplied produce no keys."""
        state = VisualStateVerifier.morrowind_state(location="Vivec")
        assert state == {"location": "Vivec"}

    def test_morrowind_state_with_extras(self):
        """Extra keyword arguments are passed through into the state dict."""
        state = VisualStateVerifier.morrowind_state(
            location="Balmora",
            quest_complete=True,
            gold_min=1000
        )
        assert state["quest_complete"] is True
        assert state["gold_min"] == 1000

    def test_prompt_includes_conditions(self):
        """The prompt mentions every condition, the context and the game."""
        verifier = VisualStateVerifier()
        expected = {"location": "Balmora", "health_above": 50}
        prompt = verifier._build_prompt(expected, "Test context", "morrowind")
        assert "Balmora" in prompt
        assert "50" in prompt
        assert "Test context" in prompt
        assert "morrowind" in prompt

    def test_parse_analysis_returns_pending_for_raw(self):
        """The un-analyzed envelope (has "instruction") parses as UNCERTAIN."""
        verifier = VisualStateVerifier()
        raw_analysis = json.dumps({
            "prompt": "test",
            "screenshot_path": "/tmp/test.png",
            "instruction": "Use vision_analyze"
        })
        result = verifier._parse_analysis(raw_analysis, {}, "/tmp/test.png")
        assert result.status == VerificationStatus.UNCERTAIN
        assert not result.verified

    def test_parse_analysis_extracts_json(self):
        """A fenced JSON verdict with verified=true yields VERIFIED."""
        verifier = VisualStateVerifier()
        analysis = """
The player appears to be in Balmora.
Health looks good.
```json
{
"verified": true,
"confidence": 0.85,
"details": "Player is in Balmora with weapon equipped",
"mismatches": []
}
```
"""
        result = verifier._parse_analysis(analysis, {"location": "Balmora"}, "/tmp/test.png")
        assert result.status == VerificationStatus.VERIFIED
        assert result.verified
        assert result.confidence == 0.85
        assert result.mismatches == []

    def test_parse_analysis_handles_failures(self):
        """A fenced JSON verdict with verified=false yields FAILED plus mismatches."""
        verifier = VisualStateVerifier()
        analysis = """
```json
{
"verified": false,
"confidence": 0.9,
"details": "Player is not in Balmora",
"mismatches": ["location"]
}
```
"""
        result = verifier._parse_analysis(analysis, {"location": "Balmora"}, "/tmp/test.png")
        assert result.status == VerificationStatus.FAILED
        assert not result.verified
        assert "location" in result.mismatches