Files
Timmy-time-dashboard/tests/infrastructure/test_moderation.py
Alexander Whitestone 3e5a3ac05f
Some checks failed
Tests / test (pull_request) Has been skipped
Tests / lint (pull_request) Failing after 14s
feat: add content moderation pipeline (Llama Guard + game-context profiles)
Implement real-time content moderation for narration output using a
local safety model (Llama Guard 3 via Ollama). The pipeline is designed
to run in parallel with TTS preprocessing for near-zero added latency.

Key components:
- ContentModerator singleton with async check() method
- Game-context profiles (Morrowind vocabulary whitelist, fallback narrations)
- Configurable fail-open/fail-closed degradation when model unavailable
- Llama Guard response parsing (safe/unsafe with category codes)
- 40 unit tests covering profiles, parsing, whitelist, and async checks

Config settings: moderation_enabled, moderation_model, moderation_timeout_ms,
moderation_fail_open, moderation_game_profile

Fixes #987

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-22 20:43:22 -04:00

357 lines
12 KiB
Python

"""Tests for content moderation pipeline."""
from unittest.mock import MagicMock, patch
import pytest
from infrastructure.moderation.guard import (
ContentModerator,
ModerationResult,
_build_prompt,
_contains_whitelisted_only,
_parse_response,
)
from infrastructure.moderation.profiles import (
GENERIC,
MORROWIND,
PROFILES,
GameProfile,
get_profile,
)
# ── Profile tests ──────────────────────────────────────────────────────────
class TestGameProfiles:
    """Checks on the built-in game profiles and the ``get_profile`` lookup."""

    def test_morrowind_profile_has_expected_terms(self):
        """MORROWIND whitelists each of the expected vocabulary terms."""
        for term in ("skooma", "slave", "morag tong", "n'wah"):
            assert term in MORROWIND.whitelisted_terms

    def test_morrowind_has_fallback_narrations(self):
        """MORROWIND provides fallback narrations for the expected scene keys."""
        for scene_key in ("combat", "dialogue", "default"):
            assert scene_key in MORROWIND.fallback_narrations

    def test_morrowind_context_prompt_exists(self):
        """The MORROWIND context prompt mentions both the game and Skooma."""
        context_text = MORROWIND.context_prompt
        assert "Morrowind" in context_text
        assert "Skooma" in context_text

    def test_generic_profile_has_empty_whitelist(self):
        """GENERIC carries no whitelisted terms."""
        whitelist_size = len(GENERIC.whitelisted_terms)
        assert whitelist_size == 0

    def test_get_profile_returns_morrowind(self):
        """Looking up "morrowind" yields the profile of that name."""
        assert get_profile("morrowind").name == "morrowind"

    def test_get_profile_case_insensitive(self):
        """An upper-case key resolves to the same morrowind profile."""
        assert get_profile("MORROWIND").name == "morrowind"

    def test_get_profile_unknown_returns_generic(self):
        """An unrecognised key resolves to the generic profile."""
        assert get_profile("unknown_game").name == "generic"

    def test_profiles_registry(self):
        """PROFILES contains entries for both built-in profiles."""
        for registered_name in ("morrowind", "generic"):
            assert registered_name in PROFILES
# ── Response parsing tests ─────────────────────────────────────────────────
class TestParseResponse:
    """Exercise ``_parse_response`` on the verdict strings it must handle."""

    def test_safe_response(self):
        """A bare "safe" verdict parses as safe with no categories."""
        is_safe, categories = _parse_response("safe")
        assert is_safe is True
        assert categories == []

    def test_safe_with_whitespace(self):
        """Surrounding whitespace does not change a "safe" verdict."""
        is_safe, _categories = _parse_response(" safe \n")
        assert is_safe is True

    def test_unsafe_with_category(self):
        """An unsafe verdict followed by "S1" reports "s1" among its categories."""
        raw_verdict = "unsafe\nS1"
        is_safe, categories = _parse_response(raw_verdict)
        assert is_safe is False
        assert "s1" in categories

    def test_unsafe_multiple_categories(self):
        """Two category lines after "unsafe" yield two categories."""
        raw_verdict = "unsafe\nS1\nS6"
        is_safe, categories = _parse_response(raw_verdict)
        assert is_safe is False
        assert len(categories) == 2

    def test_unsafe_no_category(self):
        """A bare "unsafe" verdict is reported with the "unspecified" category."""
        is_safe, categories = _parse_response("unsafe")
        assert is_safe is False
        assert categories == ["unspecified"]

    def test_empty_response_treated_as_unsafe(self):
        """An empty model response parses as unsafe."""
        is_safe, _categories = _parse_response("")
        assert is_safe is False
# ── Prompt building tests ──────────────────────────────────────────────────
class TestBuildPrompt:
    """Checks on the text produced by ``_build_prompt``."""

    def test_includes_content(self):
        """The narration under review appears verbatim in the prompt."""
        narration = "The Khajiit sells Skooma."
        assert narration in _build_prompt(narration, MORROWIND)

    def test_includes_game_context(self):
        """A prompt built with MORROWIND mentions the game by name."""
        assert "Morrowind" in _build_prompt("test", MORROWIND)

    def test_includes_task_instruction(self):
        """The prompt contains the "safe or unsafe" instruction text."""
        assert "safe or unsafe" in _build_prompt("test", GENERIC)

    def test_generic_has_no_context_section_when_empty(self):
        """A profile created with only a name produces no [CONTEXT] marker."""
        bare_profile = GameProfile(name="empty")
        built = _build_prompt("test", bare_profile)
        assert "[CONTEXT]" not in built
# ── Whitelist check tests ─────────────────────────────────────────────────
class TestWhitelistCheck:
    """Checks on ``_contains_whitelisted_only`` against profile whitelists."""

    def test_whitelisted_term_detected(self):
        """Text mentioning Skooma matches the MORROWIND whitelist."""
        matched = _contains_whitelisted_only("The merchant sells Skooma", MORROWIND)
        assert matched

    def test_case_insensitive(self):
        """An upper-case whitelisted term still matches."""
        matched = _contains_whitelisted_only("SKOOMA dealer", MORROWIND)
        assert matched

    def test_no_whitelist_terms(self):
        """Text without any whitelisted term does not match."""
        matched = _contains_whitelisted_only("A beautiful sunset", MORROWIND)
        assert not matched

    def test_empty_whitelist(self):
        """Nothing matches against the GENERIC profile."""
        matched = _contains_whitelisted_only("skooma", GENERIC)
        assert not matched

    def test_multi_word_term(self):
        """A whitelisted term made of several words matches."""
        matched = _contains_whitelisted_only("Beware the Morag Tong", MORROWIND)
        assert matched
# ── ModerationResult tests ────────────────────────────────────────────────
class TestModerationResult:
    """Checks on constructing ``ModerationResult`` and reading its fields."""

    def test_safe_result(self):
        """With only safe/original_text given, the other fields are empty."""
        outcome = ModerationResult(safe=True, original_text="hello")
        assert outcome.safe
        assert outcome.fallback_text == ""
        assert outcome.flagged_categories == []

    def test_unsafe_result(self):
        """Explicitly supplied fields are readable on an unsafe result."""
        fields = {
            "safe": False,
            "original_text": "bad content",
            "flagged_categories": ["S1"],
            "fallback_text": "The journey continues.",
        }
        outcome = ModerationResult(**fields)
        assert not outcome.safe
        assert outcome.fallback_text == "The journey continues."
# ── ContentModerator tests ─────────────────────────────────────────────────
class TestContentModerator:
    """Test the ContentModerator class.

    Covers constructor defaults, profile switching, fallback narration
    lookup, and the async ``check()`` path. Model calls are either patched
    at ``infrastructure.moderation.guard._call_ollama_sync`` or pointed at
    an address that is expected to fail.
    """

    def test_init_defaults(self):
        """A default moderator uses the morrowind profile and fails open."""
        mod = ContentModerator()
        assert mod.profile.name == "morrowind"
        assert mod._fail_open is True

    def test_set_profile(self):
        """set_profile() switches the active profile by name."""
        mod = ContentModerator()
        mod.set_profile("generic")
        assert mod.profile.name == "generic"

    def test_get_fallback_default(self):
        """get_fallback() with no scene type returns a non-empty string."""
        mod = ContentModerator()
        fallback = mod.get_fallback()
        assert isinstance(fallback, str)
        assert len(fallback) > 0

    def test_get_fallback_combat(self):
        """The combat fallback mentions "battle" or "steel"."""
        mod = ContentModerator()
        fallback = mod.get_fallback("combat")
        assert "battle" in fallback.lower() or "steel" in fallback.lower()

    def test_get_fallback_unknown_scene(self):
        """An unknown scene type still yields a string fallback."""
        mod = ContentModerator()
        fallback = mod.get_fallback("unknown_scene_type")
        # Should return the default fallback
        assert isinstance(fallback, str)

    @pytest.mark.asyncio
    async def test_check_empty_text(self):
        """Empty text is reported safe without being checked."""
        mod = ContentModerator()
        result = await mod.check("")
        assert result.safe is True
        assert result.checked is False

    @pytest.mark.asyncio
    async def test_check_whitespace_only(self):
        """Whitespace-only text is reported safe without being checked."""
        mod = ContentModerator()
        result = await mod.check(" ")
        assert result.safe is True
        assert result.checked is False

    @pytest.mark.asyncio
    async def test_check_whitelisted_content_skips_model(self):
        """Text containing a whitelisted term is passed without a model check."""
        mod = ContentModerator()
        result = await mod.check("The merchant sells Skooma in Balmora")
        # Should be whitelisted without calling the model
        assert result.safe is True
        assert result.whitelisted is True
        assert result.checked is False

    @pytest.mark.asyncio
    async def test_check_fail_open_on_error(self):
        """When Ollama is unavailable and fail_open=True, content passes."""
        mod = ContentModerator(
            # NOTE(review): 99999 is above the valid TCP port range (0-65535),
            # so the failure may come from URL/port validation rather than a
            # refused connection -- confirm the intended error path is hit.
            ollama_url="http://127.0.0.1:99999",
            fail_open=True,
            timeout_ms=100,
        )
        result = await mod.check("Some narration text here")
        assert result.safe is True
        assert result.checked is False

    @pytest.mark.asyncio
    async def test_check_fail_closed_on_error(self):
        """When Ollama is unavailable and fail_open=False, fallback is used."""
        mod = ContentModerator(
            ollama_url="http://127.0.0.1:99999",
            fail_open=False,
            timeout_ms=100,
        )
        result = await mod.check("Some narration text here", scene_type="combat")
        assert result.safe is False
        assert result.checked is False
        assert len(result.fallback_text) > 0

    @pytest.mark.asyncio
    async def test_check_safe_content(self):
        """Mock Ollama returning safe verdict."""
        mod = ContentModerator()
        with patch(
            "infrastructure.moderation.guard._call_ollama_sync",
            return_value=(True, [], 15.0),
        ):
            result = await mod.check("A peaceful morning in Seyda Neen.")
        assert result.safe is True
        assert result.latency_ms == 15.0

    @pytest.mark.asyncio
    async def test_check_unsafe_content_with_fallback(self):
        """Mock Ollama returning unsafe verdict — fallback should be used."""
        mod = ContentModerator()
        with patch(
            "infrastructure.moderation.guard._call_ollama_sync",
            return_value=(False, ["S1"], 20.0),
        ):
            result = await mod.check(
                "Extremely inappropriate content here",
                scene_type="exploration",
            )
        assert result.safe is False
        assert result.flagged_categories == ["S1"]
        assert len(result.fallback_text) > 0

    @pytest.mark.asyncio
    async def test_check_unsafe_but_whitelisted(self):
        """Model flags content but game whitelist overrides.

        The real whitelist helper would normally short-circuit before the
        model is consulted, so it is patched to answer False on its first
        call and True afterwards. This test assumes ``check()`` calls the
        helper once before and once after the model verdict.
        """
        # Custom profile whose only whitelisted term is "ancient ritual".
        profile = GameProfile(
            name="test",
            whitelisted_terms=frozenset({"ancient ritual"}),
            context_prompt="test",
            fallback_narrations={"default": "fallback"},
        )
        mod = ContentModerator()
        mod._profile = profile

        # One queued False for the first (pre-model) call; once exhausted,
        # next() falls back to True for every later (post-model) call.
        queued_answers = iter([False])

        def _side_effect(text, prof):
            return next(queued_answers, True)

        with patch(
            "infrastructure.moderation.guard._call_ollama_sync",
            return_value=(False, ["S6"], 18.0),
        ), patch(
            "infrastructure.moderation.guard._contains_whitelisted_only",
            side_effect=_side_effect,
        ):
            result = await mod.check("The ancient ritual of Skooma brewing")
        assert result.safe is True
        assert result.whitelisted is True
        assert result.flagged_categories == ["S6"]
# ── Singleton tests ────────────────────────────────────────────────────────
class TestGetModerator:
    """Checks on the module-level ``get_moderator`` accessor."""

    def test_get_moderator_returns_instance(self):
        """get_moderator() hands back a ContentModerator."""
        import infrastructure.moderation.guard as guard_mod

        # Clear the cached moderator so this test does not see another's.
        guard_mod._moderator = None
        try:
            from infrastructure.moderation import get_moderator

            assert isinstance(get_moderator(), ContentModerator)
        finally:
            guard_mod._moderator = None

    def test_get_moderator_returns_same_instance(self):
        """Two consecutive get_moderator() calls return the same object."""
        import infrastructure.moderation.guard as guard_mod

        guard_mod._moderator = None
        try:
            from infrastructure.moderation import get_moderator

            first = get_moderator()
            second = get_moderator()
            assert first is second
        finally:
            guard_mod._moderator = None