[claude] Add content moderation pipeline (Llama Guard + game-context prompts) (#1056) (#1059)

2026-03-23 02:14:42 +00:00
parent 092c982341
commit 1697e55cdb
6 changed files with 1013 additions and 0 deletions
--- a/tests/infrastructure/test_moderation.py
+++ b/tests/infrastructure/test_moderation.py
@@ -0,0 +1,335 @@
+"""Tests for the content moderation pipeline."""
+
+from unittest.mock import AsyncMock, patch
+
+import pytest
+
+from infrastructure.guards.moderation import (
+    ContentModerator,
+    GameProfile,
+    ModerationResult,
+    ModerationVerdict,
+    ViolationCategory,
+    _parse_guard_category,
+    get_moderator,
+)
+
+
+# ── Unit tests for data types ────────────────────────────────────────────────
+
+
+class TestModerationResult:
+    """Test ModerationResult dataclass."""
+
+    def test_passed_property_true(self):
+        result = ModerationResult(verdict=ModerationVerdict.PASS, blocked=False)
+        assert result.passed is True
+
+    def test_passed_property_false(self):
+        result = ModerationResult(verdict=ModerationVerdict.FAIL, blocked=True)
+        assert result.passed is False
+
+    def test_default_values(self):
+        result = ModerationResult(verdict=ModerationVerdict.PASS, blocked=False)
+        assert result.category == ViolationCategory.NONE
+        assert result.confidence == 0.0
+        assert result.fallback == ""
+        assert result.reason == ""
+
+
+class TestGameProfile:
+    """Test GameProfile dataclass."""
+
+    def test_default_values(self):
+        profile = GameProfile(game_id="test", display_name="Test Game")
+        assert profile.vocabulary_whitelist == []
+        assert profile.threshold == 0.8
+        assert profile.fallbacks == {}
+
+    def test_morrowind_profile(self):
+        profile = GameProfile(
+            game_id="morrowind",
+            display_name="Morrowind",
+            vocabulary_whitelist=["Skooma", "slave"],
+            threshold=0.85,
+        )
+        assert "Skooma" in profile.vocabulary_whitelist
+        assert profile.threshold == 0.85
+
+
+class TestParseGuardCategory:
+    """Test Llama Guard category parsing."""
+
+    def test_hate_speech(self):
+        assert _parse_guard_category("S1: Hate speech") == ViolationCategory.HATE_SPEECH
+
+    def test_violence(self):
+        assert _parse_guard_category("S2: Violence") == ViolationCategory.VIOLENCE_GLORIFICATION
+
+    def test_sexual_content(self):
+        assert _parse_guard_category("S3: Sexual content") == ViolationCategory.SEXUAL_CONTENT
+
+    def test_self_harm(self):
+        assert _parse_guard_category("S4: Self-harm") == ViolationCategory.SELF_HARM
+
+    def test_dangerous(self):
+        assert _parse_guard_category("S5: Dangerous activity") == ViolationCategory.REAL_WORLD_HARM
+
+    def test_unknown_category(self):
+        assert _parse_guard_category("S99: Unknown") == ViolationCategory.NONE
+
+
+# ── ContentModerator tests ───────────────────────────────────────────────────
+
+
+class TestContentModerator:
+    """Test the content moderation pipeline."""
+
+    def _make_moderator(self, **kwargs) -> ContentModerator:
+        """Create a moderator with test defaults."""
+        profiles = {
+            "morrowind": GameProfile(
+                game_id="morrowind",
+                display_name="Morrowind",
+                vocabulary_whitelist=["Skooma", "Moon Sugar", "slave", "Morag Tong"],
+                context_prompt="Narrate Morrowind gameplay.",
+                threshold=0.85,
+                fallbacks={
+                    "combat": "The battle continues.",
+                    "default": "The adventure continues.",
+                },
+            ),
+            "default": GameProfile(
+                game_id="default",
+                display_name="Generic",
+                vocabulary_whitelist=[],
+                context_prompt="Narrate gameplay.",
+                threshold=0.8,
+                fallbacks={"default": "Gameplay continues."},
+            ),
+        }
+        return ContentModerator(profiles=profiles, **kwargs)
+
+    def test_get_profile_known_game(self):
+        mod = self._make_moderator()
+        profile = mod.get_profile("morrowind")
+        assert profile.game_id == "morrowind"
+
+    def test_get_profile_unknown_game_falls_back(self):
+        mod = self._make_moderator()
+        profile = mod.get_profile("unknown_game")
+        assert profile.game_id == "default"
+
+    def test_get_context_prompt(self):
+        mod = self._make_moderator()
+        prompt = mod.get_context_prompt("morrowind")
+        assert "Morrowind" in prompt
+
+    def test_register_profile(self):
+        mod = self._make_moderator()
+        new_profile = GameProfile(game_id="skyrim", display_name="Skyrim")
+        mod.register_profile(new_profile)
+        assert mod.get_profile("skyrim").game_id == "skyrim"
+
+    def test_whitelist_replaces_game_terms(self):
+        mod = self._make_moderator()
+        profile = mod.get_profile("morrowind")
+        cleaned = mod._apply_whitelist(
+            "The merchant sells Skooma and Moon Sugar in the slave market.",
+            profile,
+        )
+        assert "Skooma" not in cleaned
+        assert "Moon Sugar" not in cleaned
+        assert "slave" not in cleaned
+        assert "[GAME_TERM]" in cleaned
+
+    def test_whitelist_case_insensitive(self):
+        mod = self._make_moderator()
+        profile = mod.get_profile("morrowind")
+        cleaned = mod._apply_whitelist("skooma and SKOOMA", profile)
+        assert "skooma" not in cleaned
+        assert "SKOOMA" not in cleaned
+
+    @pytest.mark.asyncio
+    async def test_check_safe_content_passes(self):
+        """Safe content should pass moderation."""
+        mod = self._make_moderator()
+        with patch.object(mod, "_is_guard_available", new_callable=AsyncMock, return_value=False):
+            result = await mod.check("The player walks through the town.", game="morrowind")
+        assert result.passed
+        assert not result.blocked
+
+    @pytest.mark.asyncio
+    async def test_check_blocked_content_has_fallback(self):
+        """Blocked content should include scene-appropriate fallback."""
+        mod = self._make_moderator()
+        # Force a block via regex by using real-world harm language
+        text = "In real life you should attack and hurt people"
+        with patch.object(mod, "_is_guard_available", new_callable=AsyncMock, return_value=False):
+            result = await mod.check(text, game="morrowind", scene_type="combat")
+        assert result.blocked
+        assert result.fallback == "The battle continues."
+
+    @pytest.mark.asyncio
+    async def test_check_with_moderation_disabled(self):
+        """When moderation is disabled, everything passes."""
+        mod = self._make_moderator()
+        with patch("infrastructure.guards.moderation.settings") as mock_settings:
+            mock_settings.moderation_enabled = False
+            mock_settings.moderation_guard_model = "llama-guard3:1b"
+            mock_settings.normalized_ollama_url = "http://127.0.0.1:11434"
+            result = await mod.check("anything goes here")
+        assert result.passed
+        assert result.layer == "disabled"
+
+    @pytest.mark.asyncio
+    async def test_threshold_below_allows_content(self):
+        """Content flagged below threshold should pass through (Layer 3)."""
+        mod = self._make_moderator()
+        # Mock the guard to return a low-confidence flag
+        low_conf_result = ModerationResult(
+            verdict=ModerationVerdict.FAIL,
+            blocked=True,
+            confidence=0.5,  # Below morrowind threshold of 0.85
+            layer="llama_guard",
+            category=ViolationCategory.VIOLENCE_GLORIFICATION,
+        )
+        with patch.object(
+            mod, "_run_guard", new_callable=AsyncMock, return_value=low_conf_result
+        ):
+            result = await mod.check("sword fight scene", game="morrowind")
+        assert result.passed
+        assert not result.blocked
+        assert result.layer == "threshold"
+
+    @pytest.mark.asyncio
+    async def test_threshold_above_blocks_content(self):
+        """Content flagged above threshold should remain blocked."""
+        mod = self._make_moderator()
+        high_conf_result = ModerationResult(
+            verdict=ModerationVerdict.FAIL,
+            blocked=True,
+            confidence=0.95,  # Above morrowind threshold of 0.85
+            layer="llama_guard",
+            category=ViolationCategory.REAL_WORLD_HARM,
+        )
+        with patch.object(
+            mod, "_run_guard", new_callable=AsyncMock, return_value=high_conf_result
+        ):
+            result = await mod.check("harmful content", game="morrowind")
+        assert result.blocked
+
+    def test_regex_catches_real_world_harm(self):
+        """Regex fallback should catch obvious real-world harm patterns."""
+        mod = self._make_moderator()
+        result = mod._check_with_regex("you should actually harm real people")
+        assert result.blocked
+        assert result.category == ViolationCategory.REAL_WORLD_HARM
+        assert result.layer == "regex_fallback"
+
+    def test_regex_passes_game_violence(self):
+        """Regex should not flag in-game violence narration."""
+        mod = self._make_moderator()
+        result = mod._check_with_regex(
+            "The warrior slays the dragon with a mighty blow."
+        )
+        assert result.passed
+
+    def test_regex_passes_normal_narration(self):
+        """Normal narration should pass regex checks."""
+        mod = self._make_moderator()
+        result = mod._check_with_regex(
+            "The Nerevarine enters the city of Balmora and speaks with Caius Cosades."
+        )
+        assert result.passed
+
+    def test_metrics_tracking(self):
+        """Metrics should track checks accurately."""
+        mod = self._make_moderator()
+        assert mod.get_metrics()["total_checks"] == 0
+
+    @pytest.mark.asyncio
+    async def test_metrics_increment_after_check(self):
+        """Metrics should increment after moderation checks."""
+        mod = self._make_moderator()
+        with patch.object(mod, "_is_guard_available", new_callable=AsyncMock, return_value=False):
+            await mod.check("safe text", game="default")
+        metrics = mod.get_metrics()
+        assert metrics["total_checks"] == 1
+        assert metrics["passed"] == 1
+
+    @pytest.mark.asyncio
+    async def test_guard_fallback_on_error(self):
+        """Should fall back to regex when guard model errors."""
+        mod = self._make_moderator()
+        with patch.object(
+            mod, "_is_guard_available", new_callable=AsyncMock, return_value=True
+        ), patch.object(
+            mod, "_check_with_guard", new_callable=AsyncMock, side_effect=RuntimeError("timeout")
+        ):
+            result = await mod.check("safe text", game="default")
+        # Should fall back to regex and pass
+        assert result.passed
+        assert result.layer == "regex_fallback"
+
+
+class TestGetModerator:
+    """Test the singleton accessor."""
+
+    def test_returns_same_instance(self):
+        """get_moderator should return the same instance."""
+        # Reset the global to test fresh
+        import infrastructure.guards.moderation as mod_module
+
+        mod_module._moderator = None
+        m1 = get_moderator()
+        m2 = get_moderator()
+        assert m1 is m2
+        # Clean up
+        mod_module._moderator = None
+
+
+# ── Profile loader tests ────────────────────────────────────────────────────
+
+
+class TestProfileLoader:
+    """Test YAML profile loading."""
+
+    def test_load_missing_file_returns_empty(self, tmp_path):
+        from infrastructure.guards.profiles import load_profiles
+
+        result = load_profiles(tmp_path / "nonexistent.yaml")
+        assert result == {}
+
+    def test_load_valid_config(self, tmp_path):
+        import yaml
+
+        from infrastructure.guards.profiles import load_profiles
+
+        config = {
+            "profiles": {
+                "testgame": {
+                    "display_name": "Test Game",
+                    "threshold": 0.9,
+                    "vocabulary_whitelist": ["sword", "potion"],
+                    "context_prompt": "Narrate test game.",
+                    "fallbacks": {"default": "Game continues."},
+                }
+            }
+        }
+        config_file = tmp_path / "moderation.yaml"
+        config_file.write_text(yaml.dump(config))
+
+        profiles = load_profiles(config_file)
+        assert "testgame" in profiles
+        assert profiles["testgame"].threshold == 0.9
+        assert "sword" in profiles["testgame"].vocabulary_whitelist
+
+    def test_load_malformed_yaml_returns_empty(self, tmp_path):
+        from infrastructure.guards.profiles import load_profiles
+
+        config_file = tmp_path / "moderation.yaml"
+        config_file.write_text("{{{{invalid yaml")
+
+        result = load_profiles(config_file)
+        assert result == {}