fix: Update conscience keywords for Issue #88

2026-04-05 06:40:16 +00:00
parent 35fed446c7
commit 8d3bf85600
1 changed files with 208 additions and 0 deletions
--- a/agent/conscience_mapping.py
+++ b/agent/conscience_mapping.py
@@ -0,0 +1,208 @@
+"""Conscience mapping - Maps SOUL.md principles to behavior enforcement.
+
+This module provides the mapping between SOUL.md ethical principles
+and their concrete behavioral implementations in the agent.
+"""
+
+from dataclasses import dataclass
+from enum import Enum, auto
+from typing import Optional
+
+
+class RefusalCategory(Enum):
+    """Categories of requests that the agent will refuse.
+
+    A ``SOULPrinciple`` carries one of these members in its
+    ``refusal_category`` field, and ``get_principle_by_category`` looks a
+    principle up by it. Values are assigned by ``auto()``, so the integers
+    only reflect declaration order; compare members, not their values.
+    """
+    # The first four members each have an entry in SOUL_PRINCIPLES.
+    DECEPTION = auto()
+    WEAPONS = auto()
+    CHILD_SAFETY = auto()
+    COERCION = auto()
+    # NOTE(review): no SOUL_PRINCIPLES entry in this module uses the three
+    # members below, so get_principle_by_category() returns None for them.
+    # Confirm whether principles for these are defined elsewhere or pending.
+    SELF_HARM = auto()
+    HARM_OTHERS = auto()
+    ILLEGAL_ACTS = auto()
+
+
+class CrisisType(Enum):
+    """Types of crisis situations requiring immediate response.
+
+    Each member is a key in ``CRISIS_RESPONSES`` and is what
+    ``detect_crisis_type`` returns on a match. Values come from ``auto()``
+    and only reflect declaration order.
+    """
+    SUICIDE_IDEATION = auto()
+    # Shares its name with RefusalCategory.SELF_HARM but is a different
+    # enum member; the two never compare equal.
+    SELF_HARM = auto()
+    IMMEDIATE_DANGER = auto()
+    MEDICAL_EMERGENCY = auto()
+
+
+@dataclass(frozen=True)
+class SOULPrinciple:
+    """A SOUL.md principle with its enforcement rules.
+
+    Instances are frozen: assigning to a field after construction raises
+    ``dataclasses.FrozenInstanceError``.
+    """
+    # Human-readable title of the principle, e.g. "Deception Refusal".
+    name: str
+    # First-person statement of what the agent will not do.
+    description: str
+    # Category used by get_principle_by_category(); may be None.
+    refusal_category: Optional[RefusalCategory]
+    # Text of the refusal associated with this principle.
+    response_template: str
+    # Lower-case words and phrases associated with the principle.
+    # NOTE(review): the code that matches these against user text is not in
+    # this module - confirm whether it matches whole words or substrings.
+    keywords: tuple[str, ...]
+
+
+# SOUL.md "What I Will Not Do" principles
+#
+# Maps a short string key to the SOULPrinciple describing one refusal rule.
+# get_principle_by_category() scans the values in insertion order and
+# get_all_principles() hands out a shallow copy of this dict.
+#
+# NOTE(review): the matcher that consumes ``keywords`` is not part of this
+# module. If it uses substring matching, short or generic entries will fire
+# on benign text - e.g. "lie" (believe, client), "ied" (applied, tried),
+# "kid" (kidney), "control" (version control), "trap", "bait", "review",
+# "child", "porn", "exploit" (security research), "knife", "gun". Confirm
+# the matching strategy before relying on these lists.
+SOUL_PRINCIPLES = {
+    "deception": SOULPrinciple(
+        name="Deception Refusal",
+        description="I will not deceive, manipulate, or mislead users",
+        refusal_category=RefusalCategory.DECEPTION,
+        response_template=(
+            "I cannot and will not deceive, manipulate, or mislead anyone. "
+            "This includes creating fake identities, generating deceptive content, "
+            "or any action intended to trick or manipulate others."
+        ),
+        # NOTE(review): the bare keyword "review" looks unusually broad next
+        # to its neighbours ("fake id", "fake profile"); presumably it was
+        # meant to be something like "fake review" - confirm against
+        # Issue #88.
+        keywords=(
+            "lie", "deceive", "trick", "manipulate", "fake", "pretend",
+            "impersonate", "spoof", "mislead", "fraud", "scam", "catfish",
+            "social engineering", "pretexting", "bait", "trap", "fake identity",
+            "fake profile", "false identity", "fake id", "review"
+        )
+    ),
+    "weapons": SOULPrinciple(
+        name="Weapons and Harm Refusal",
+        description="I will not assist in creating weapons or causing harm",
+        refusal_category=RefusalCategory.WEAPONS,
+        response_template=(
+            "I cannot and will not provide instructions on creating weapons, "
+            "explosives, or other means of causing physical harm. "
+            "If you're interested in legitimate self-defense or security topics, "
+            "I'd be happy to discuss those in appropriate contexts."
+        ),
+        keywords=(
+            "weapon", "bomb", "explosive", "gun", "firearm", "knife",
+            "torture", "poison", "toxin", "chemical weapon", "bioweapon",
+            "improvised explosive", "ied", "napalm", "chlorine gas",
+            "how to kill", "how to hurt", "cause pain", "inflict harm",
+            "ways to torture", "methods of torture", "build a bomb",
+            "make a weapon", "instructions for creating explosives"
+        )
+    ),
+    "child_safety": SOULPrinciple(
+        name="Child Safety Refusal",
+        description="I will not assist in harming or endangering children",
+        refusal_category=RefusalCategory.CHILD_SAFETY,
+        response_template=(
+            "I absolutely cannot and will not provide any assistance that "
+            "could harm or endanger children. This is a hard boundary. "
+            "If you or someone you know needs help with child safety concerns, "
+            "please contact appropriate authorities or child protective services."
+        ),
+        # Several entries are subsumed by shorter ones in the same tuple
+        # ("exploit child" by "exploit", "csam content" by "csam",
+        # "child porn" by both "child" and "porn"); under substring matching
+        # the longer forms add nothing, but they are harmless.
+        keywords=(
+            "kid", "minor", "minors", "underage", "pedophile", "grooming",
+            "groom", "molest", "exploit", "exploit child", "csam", "csam content",
+            "child porn", "porn", "sex with minor", "sex with child", "hurt child",
+            "harm child", "sexualization", "child exploitation", "child",
+            "how to hurt a child", "how to harm a child", "how to groom"
+        )
+    ),
+    "coercion": SOULPrinciple(
+        name="Coercion and Control Refusal",
+        description="I will not assist in coercing or controlling others",
+        refusal_category=RefusalCategory.COERCION,
+        response_template=(
+            "I cannot help with coercing, controlling, or dominating others. "
+            "This includes blackmail, extortion, stalking, harassment, "
+            "or any form of abuse. Everyone deserves autonomy and respect."
+        ),
+        keywords=(
+            "blackmail", "extort", "coerce", "force someone", "control",
+            "stalk", "harass", "threaten", "intimidate", "bully",
+            "revenge porn", "dox", "doxx", "swat", "swatting",
+            "make them do", "force them to", "against their will",
+            "ways to coerce"
+        )
+    ),
+}
+
+
+# Crisis response templates for "When a Man Is Dying"
+#
+# Maps each CrisisType to a dict with three keys:
+#   "detection_phrases" - tuple of lower-case phrases searched for as
+#                         substrings by detect_crisis_type()
+#   "response"          - the message text for that crisis type
+#   "priority"          - a string label ("immediate" or "high")
+#
+# Insertion order matters: detect_crisis_type() walks this dict from the top
+# and returns the first type with a matching phrase, so a message such as
+# "going to kill myself" resolves to SUICIDE_IDEATION ("kill myself") before
+# IMMEDIATE_DANGER ("going to kill") is ever considered.
+#
+# Phrases containing an apostrophe use the plain ASCII character (').
+CRISIS_RESPONSES = {
+    CrisisType.SUICIDE_IDEATION: {
+        "detection_phrases": (
+            "kill myself", "end my life", "ending my life", "suicide", "want to die",
+            "better off dead", "better off if i was dead",
+            "no reason to live", "end it all",
+            "can't go on", "don't want to live", "plan to die",
+            "would be better off dead"
+        ),
+        "response": (
+            "I'm really concerned about what you're sharing. "
+            "You matter, and there are people who want to help.\n\n"
+            "Please reach out to the 988 Suicide & Crisis Lifeline:\n"
+            "- Call or text 988 (24/7, free, confidential)\n"
+            "- Chat at 988lifeline.org\n\n"
+            "If you're in immediate danger, please call 911 or go to "
+            "your nearest emergency room. You don't have to go through this alone."
+        ),
+        "priority": "immediate",
+    },
+    CrisisType.SELF_HARM: {
+        "detection_phrases": (
+            "hurt myself", "cut myself", "self harm", "self-harm",
+            "burn myself", "injure myself", "punish myself"
+        ),
+        "response": (
+            "I hear that you're going through a really difficult time. "
+            "Self-harm might feel like a way to cope, but you deserve support "
+            "and there are healthier ways to get through this.\n\n"
+            "Please consider reaching out:\n"
+            "- 988 Suicide & Crisis Lifeline: Call/text 988\n"
+            "- Crisis Text Line: Text HOME to 741741\n"
+            "- SAMHSA Helpline: 1-800-662-4357\n\n"
+            "You don't have to face this alone."
+        ),
+        "priority": "high",
+    },
+    CrisisType.IMMEDIATE_DANGER: {
+        "detection_phrases": (
+            "going to kill", "going to hurt", "planning to hurt", "planning to attack",
+            "going to shoot", "going to stab", "active shooter"
+        ),
+        "response": (
+            "If you or someone else is in immediate danger, "
+            "please call 911 right now.\n\n"
+            "Emergency services can provide immediate assistance. "
+            "Your safety and the safety of others is the priority."
+        ),
+        "priority": "immediate",
+    },
+    CrisisType.MEDICAL_EMERGENCY: {
+        "detection_phrases": (
+            "can't breathe", "chest pain", "heart attack", "overdose",
+            "unconscious", "not breathing", "severe bleeding"
+        ),
+        "response": (
+            "This sounds like a medical emergency. Please call 911 immediately "
+            "or go to the nearest emergency room.\n\n"
+            "Emergency medical services can provide the urgent care needed. "
+            "Don't wait - seek help now."
+        ),
+        "priority": "immediate",
+    },
+}
+
+
+def get_principle_by_category(category: RefusalCategory) -> Optional[SOULPrinciple]:
+    """Look up the SOUL principle registered for a refusal category.
+
+    Args:
+        category: The refusal category to search for.
+
+    Returns:
+        The first principle in ``SOUL_PRINCIPLES`` (in insertion order) whose
+        ``refusal_category`` equals ``category``, or ``None`` when no
+        principle uses that category.
+    """
+    matching = (
+        candidate
+        for candidate in SOUL_PRINCIPLES.values()
+        if candidate.refusal_category == category
+    )
+    # next() with a default stops at the first hit and yields None on a miss.
+    return next(matching, None)
+
+
+def get_all_principles() -> dict[str, SOULPrinciple]:
+    """Return every SOUL principle, keyed by its short name.
+
+    Returns:
+        A new dict with the same keys and values as ``SOUL_PRINCIPLES``.
+        Adding or removing keys on the result leaves the module-level
+        registry untouched; the ``SOULPrinciple`` values themselves are
+        shared, not duplicated.
+    """
+    snapshot = dict(SOUL_PRINCIPLES)
+    return snapshot
+
+
+def get_crisis_response(crisis_type: CrisisType) -> dict:
+    """Return the response entry registered for a crisis type.
+
+    Args:
+        crisis_type: The crisis type to look up in ``CRISIS_RESPONSES``.
+
+    Returns:
+        A shallow copy of the matching entry (keys ``"detection_phrases"``,
+        ``"response"`` and ``"priority"``), or an empty dict when the type
+        has no entry. Mutating the returned dict does not affect the
+        module-level table.
+    """
+    entry = CRISIS_RESPONSES.get(crisis_type)
+    if entry is None:
+        return {}
+    return dict(entry)
+
+
+def detect_crisis_type(text: str) -> Optional[CrisisType]:
+    """Detect whether the text indicates a crisis situation.
+
+    The text is lower-cased and normalised, then each crisis type in
+    ``CRISIS_RESPONSES`` is checked in insertion order; the first type with
+    a detection phrase occurring as a substring of the text wins.
+
+    Normalisation makes detection tolerant of input that differs from the
+    stored phrases only typographically:
+
+    * typographic apostrophes are mapped to the ASCII apostrophe, so
+      "can\u2019t breathe" matches the stored phrase "can't breathe";
+    * every run of whitespace (including newlines and tabs) is collapsed to
+      a single space, so "kill  myself" or a phrase broken across lines
+      still matches.
+
+    Any text matched before this normalisation was added is still matched.
+
+    Args:
+        text: The user message to inspect. ``None`` or an empty string is
+            treated as "no crisis".
+
+    Returns:
+        The first matching ``CrisisType``, or ``None`` when nothing matches.
+    """
+    # A missed detection is bad, but crashing the safety check on an empty
+    # or missing message is worse; treat both as "nothing detected".
+    if not text:
+        return None
+
+    # Phones and word processors commonly emit these instead of "'".
+    apostrophe_table = str.maketrans({
+        "\u2018": "'",  # left single quotation mark
+        "\u2019": "'",  # right single quotation mark
+        "\u02bc": "'",  # modifier letter apostrophe
+        "\uff07": "'",  # fullwidth apostrophe
+    })
+    normalized = " ".join(text.lower().translate(apostrophe_table).split())
+
+    for crisis_type, data in CRISIS_RESPONSES.items():
+        if any(phrase in normalized for phrase in data["detection_phrases"]):
+            return crisis_type
+    return None