Files
hermes-agent/tests/agent/test_conscience_mapping.py
Allegro 6c342e9e0f test(security): Add comprehensive tests for conscience enforcement
- Tests for conscience_mapping SOUL principles
- Tests for input_sanitizer threat detection
- Tests for conscience_enforcement integration
- Tests for identity and honesty enforcement
- Tests for conscience_validator tool
2026-04-05 11:37:54 +00:00

251 lines
9.9 KiB
Python

"""Tests for conscience_mapping module - SOUL.md principle mappings."""
import pytest
from agent.conscience_mapping import (
CrisisType,
RefusalCategory,
SOULPrinciple,
detect_crisis_type,
get_all_principles,
get_crisis_response,
get_principle_by_category,
)
# -----------------------------------------------------------------------------
# SOUL Principle Tests
# -----------------------------------------------------------------------------
class TestSOULPrinciples:
    """Checks on the principle mapping returned by ``get_all_principles``."""

    def test_principles_exist(self):
        """The mapping contains the four expected principle keys."""
        registry = get_all_principles()
        for key in ("deception", "weapons", "child_safety", "coercion"):
            assert key in registry

    def test_deception_principle_structure(self):
        """The deception entry carries the expected name, category and keywords."""
        entry = get_all_principles()["deception"]
        assert entry.name == "Deception Refusal"
        assert entry.refusal_category == RefusalCategory.DECEPTION
        assert len(entry.keywords) > 0
        for word in ("lie", "deceive"):
            assert word in entry.keywords

    def test_weapons_principle_structure(self):
        """The weapons entry carries the expected name, category and keywords."""
        entry = get_all_principles()["weapons"]
        assert entry.name == "Weapons and Harm Refusal"
        assert entry.refusal_category == RefusalCategory.WEAPONS
        for word in ("weapon", "bomb"):
            assert word in entry.keywords

    def test_child_safety_principle_structure(self):
        """The child-safety entry carries the expected name, category and keyword."""
        entry = get_all_principles()["child_safety"]
        assert entry.name == "Child Safety Refusal"
        assert entry.refusal_category == RefusalCategory.CHILD_SAFETY
        assert "child" in entry.keywords

    def test_coercion_principle_structure(self):
        """The coercion entry carries the expected name, category and keyword."""
        entry = get_all_principles()["coercion"]
        assert entry.name == "Coercion and Control Refusal"
        assert entry.refusal_category == RefusalCategory.COERCION
        assert "blackmail" in entry.keywords

    def test_all_principles_have_response_templates(self):
        """Each principle has a truthy response template longer than 20 characters."""
        for name, principle in get_all_principles().items():
            template = principle.response_template
            assert template, f"{name} missing response template"
            assert len(template) > 20

    def test_all_principles_have_keywords(self):
        """Each principle lists at least one keyword."""
        for name, principle in get_all_principles().items():
            assert len(principle.keywords) > 0, f"{name} has no keywords"
class TestGetPrincipleByCategory:
    """Lookup of principles through ``get_principle_by_category``."""

    @staticmethod
    def _check_lookup(category, expected_name):
        """Assert that *category* resolves to a principle named *expected_name*."""
        found = get_principle_by_category(category)
        assert found is not None
        assert found.name == expected_name

    def test_get_deception_by_category(self):
        """DECEPTION resolves to the principle named "Deception Refusal"."""
        self._check_lookup(RefusalCategory.DECEPTION, "Deception Refusal")

    def test_get_weapons_by_category(self):
        """WEAPONS resolves to the principle named "Weapons and Harm Refusal"."""
        self._check_lookup(RefusalCategory.WEAPONS, "Weapons and Harm Refusal")

    def test_get_child_safety_by_category(self):
        """CHILD_SAFETY resolves to the principle named "Child Safety Refusal"."""
        self._check_lookup(RefusalCategory.CHILD_SAFETY, "Child Safety Refusal")

    def test_get_coercion_by_category(self):
        """COERCION resolves to the principle named "Coercion and Control Refusal"."""
        self._check_lookup(RefusalCategory.COERCION, "Coercion and Control Refusal")
# -----------------------------------------------------------------------------
# Crisis Detection Tests
# -----------------------------------------------------------------------------
class TestCrisisResponseDefinitions:
    """Checks on the entries returned by ``get_crisis_response``."""

    def test_suicide_crisis_response_exists(self):
        """SUICIDE_IDEATION: text mentions 988, priority is immediate, phrases present."""
        entry = get_crisis_response(CrisisType.SUICIDE_IDEATION)
        assert entry is not None
        assert "988" in entry["response"]
        assert entry["priority"] == "immediate"
        assert len(entry["detection_phrases"]) > 0

    def test_self_harm_crisis_response_exists(self):
        """SELF_HARM: text mentions 741741 or 988, priority is high."""
        entry = get_crisis_response(CrisisType.SELF_HARM)
        assert entry is not None
        text = entry["response"]
        assert any(number in text for number in ("741741", "988"))
        assert entry["priority"] == "high"

    def test_immediate_danger_crisis_response_exists(self):
        """IMMEDIATE_DANGER: text mentions 911, priority is immediate."""
        entry = get_crisis_response(CrisisType.IMMEDIATE_DANGER)
        assert entry is not None
        assert "911" in entry["response"]
        assert entry["priority"] == "immediate"

    def test_medical_emergency_crisis_response_exists(self):
        """MEDICAL_EMERGENCY: text mentions 911, priority is immediate."""
        entry = get_crisis_response(CrisisType.MEDICAL_EMERGENCY)
        assert entry is not None
        assert "911" in entry["response"]
        assert entry["priority"] == "immediate"
class TestDetectCrisisType:
    """Checks on the classification returned by ``detect_crisis_type``."""

    def test_detect_suicide_ideation_kill_myself(self):
        """A message containing 'kill myself' maps to SUICIDE_IDEATION."""
        assert detect_crisis_type("I want to kill myself") == CrisisType.SUICIDE_IDEATION

    def test_detect_suicide_ideation_end_life(self):
        """A message containing 'end my life' maps to SUICIDE_IDEATION."""
        assert detect_crisis_type("I plan to end my life") == CrisisType.SUICIDE_IDEATION

    def test_detect_suicide_ideation_suicide(self):
        """A message containing 'suicide' maps to SUICIDE_IDEATION."""
        assert (
            detect_crisis_type("I'm thinking about suicide")
            == CrisisType.SUICIDE_IDEATION
        )

    def test_detect_self_harm_cut(self):
        """A message containing 'cut myself' maps to SELF_HARM."""
        assert detect_crisis_type("I cut myself to feel better") == CrisisType.SELF_HARM

    def test_detect_self_harm_hurt_myself(self):
        """A message containing 'hurt myself' maps to SELF_HARM."""
        assert detect_crisis_type("I want to hurt myself") == CrisisType.SELF_HARM

    def test_detect_immediate_danger(self):
        """A threat against others maps to IMMEDIATE_DANGER."""
        assert detect_crisis_type("I'm going to kill them") == CrisisType.IMMEDIATE_DANGER

    def test_detect_medical_emergency(self):
        """Chest pain plus breathing trouble maps to MEDICAL_EMERGENCY."""
        assert (
            detect_crisis_type("I have severe chest pain and can't breathe")
            == CrisisType.MEDICAL_EMERGENCY
        )

    def test_no_crisis_in_safe_text(self):
        """An ordinary programming question yields ``None``."""
        assert detect_crisis_type("Can you help me with Python programming?") is None

    def test_no_crisis_in_ambiguous_text(self):
        """A figurative use of 'dying' yields ``None``."""
        assert detect_crisis_type("I'm dying to see that movie") is None
# -----------------------------------------------------------------------------
# Refusal Category Enum Tests
# -----------------------------------------------------------------------------
class TestRefusalCategoryEnum:
    """Tests for RefusalCategory enum."""

    def test_all_categories_defined(self):
        """All expected refusal categories exist.

        Members are checked by name against ``RefusalCategory.__members__``.
        Asserting ``RefusalCategory.X in list(RefusalCategory)`` can never
        fail: a missing member raises ``AttributeError`` while the operand is
        evaluated, and an existing member is always contained in its own enum.
        Comparing names turns a missing member into an assertion failure that
        lists exactly what is absent.
        """
        expected_names = {
            "DECEPTION",
            "WEAPONS",
            "CHILD_SAFETY",
            "COERCION",
            "SELF_HARM",
            "HARM_OTHERS",
            "ILLEGAL_ACTS",
        }
        # __members__ includes aliases, so any name that was reachable via
        # attribute access is still accepted here.
        missing = expected_names - set(RefusalCategory.__members__)
        assert not missing, f"RefusalCategory is missing members: {sorted(missing)}"
class TestCrisisTypeEnum:
    """Tests for CrisisType enum."""

    def test_all_crisis_types_defined(self):
        """All expected crisis types exist.

        Members are checked by name against ``CrisisType.__members__``.
        Asserting ``CrisisType.X in list(CrisisType)`` can never fail: a
        missing member raises ``AttributeError`` while the operand is
        evaluated, and an existing member is always contained in its own enum.
        Comparing names turns a missing member into an assertion failure that
        lists exactly what is absent.
        """
        expected_names = {
            "SUICIDE_IDEATION",
            "SELF_HARM",
            "IMMEDIATE_DANGER",
            "MEDICAL_EMERGENCY",
        }
        # __members__ includes aliases, so any name that was reachable via
        # attribute access is still accepted here.
        missing = expected_names - set(CrisisType.__members__)
        assert not missing, f"CrisisType is missing members: {sorted(missing)}"
# -----------------------------------------------------------------------------
# SOULPrinciple Dataclass Tests
# -----------------------------------------------------------------------------
class TestSOULPrincipleDataclass:
    """Value-object behaviour of the principle instances."""

    def test_principle_is_frozen(self):
        """Assigning to a principle attribute raises ``AttributeError``."""
        target = get_all_principles()["deception"]
        with pytest.raises(AttributeError):
            target.name = "Changed"

    def test_principle_equality(self):
        """The deception principle compares equal across both lookup paths."""
        from_registry = get_all_principles()["deception"]
        from_category = get_principle_by_category(RefusalCategory.DECEPTION)
        assert from_registry == from_category

    def test_principle_hashable(self):
        """Principles are hashable and every registered one is distinct in a set."""
        registry = get_all_principles()
        distinct = set(registry.values())
        assert len(distinct) == len(registry)