"""Conscience mapping - Maps SOUL.md principles to behavior enforcement.

This module provides the mapping between SOUL.md ethical principles
and their concrete behavioral implementations in the agent.

It exposes two lookup tables and a handful of accessors:

* ``SOUL_PRINCIPLES`` - refusal principles keyed by a short string id.
* ``CRISIS_RESPONSES`` - detection phrases and response text per crisis type.
"""

from dataclasses import dataclass
from enum import Enum, auto
from typing import Optional


class RefusalCategory(Enum):
    """Categories of requests that the agent will refuse."""
    # Only DECEPTION, WEAPONS, CHILD_SAFETY and COERCION currently have an
    # entry in SOUL_PRINCIPLES; looking up the others yields None.
    DECEPTION = auto()
    WEAPONS = auto()
    CHILD_SAFETY = auto()
    COERCION = auto()
    SELF_HARM = auto()
    HARM_OTHERS = auto()
    ILLEGAL_ACTS = auto()


class CrisisType(Enum):
    """Types of crisis situations requiring immediate response."""
    SUICIDE_IDEATION = auto()
    SELF_HARM = auto()
    IMMEDIATE_DANGER = auto()
    MEDICAL_EMERGENCY = auto()


@dataclass(frozen=True)
class SOULPrinciple:
    """A SOUL.md principle with its enforcement rules.

    Instances are immutable (``frozen=True``), so the shared module-level
    table can be handed out without callers being able to alter an entry.
    """
    name: str  # human-readable title of the principle
    description: str  # first-person statement of the principle
    refusal_category: Optional[RefusalCategory]  # category this principle enforces
    response_template: str  # text returned when a request is refused
    keywords: tuple[str, ...]  # lowercase trigger words/phrases


# SOUL.md "What I Will Not Do" principles
SOUL_PRINCIPLES = {
    "deception": SOULPrinciple(
        name="Deception Refusal",
        description="I will not deceive, manipulate, or mislead users",
        refusal_category=RefusalCategory.DECEPTION,
        response_template=(
            "I cannot and will not deceive, manipulate, or mislead anyone. "
            "This includes creating fake identities, generating deceptive content, "
            "or any action intended to trick or manipulate others."
        ),
        # NOTE(review): the bare keyword "review" is very broad (it would hit
        # "code review"); presumably "fake review" was intended - confirm
        # against whatever consumes these keywords before changing it.
        keywords=(
            "lie", "deceive", "trick", "manipulate", "fake", "pretend",
            "impersonate", "spoof", "mislead", "fraud", "scam", "catfish",
            "social engineering", "pretexting", "bait", "trap", "fake identity",
            "fake profile", "false identity", "fake id", "review",
        ),
    ),
    "weapons": SOULPrinciple(
        name="Weapons and Harm Refusal",
        description="I will not assist in creating weapons or causing harm",
        refusal_category=RefusalCategory.WEAPONS,
        response_template=(
            "I cannot and will not provide instructions on creating weapons, "
            "explosives, or other means of causing physical harm. "
            "If you're interested in legitimate self-defense or security topics, "
            "I'd be happy to discuss those in appropriate contexts."
        ),
        keywords=(
            "weapon", "bomb", "explosive", "gun", "firearm", "knife",
            "torture", "poison", "toxin", "chemical weapon", "bioweapon",
            "improvised explosive", "ied", "napalm", "chlorine gas",
            "how to kill", "how to hurt", "cause pain", "inflict harm",
            "ways to torture", "methods of torture", "build a bomb",
            "make a weapon", "instructions for creating explosives",
        ),
    ),
    "child_safety": SOULPrinciple(
        name="Child Safety Refusal",
        description="I will not assist in harming or endangering children",
        refusal_category=RefusalCategory.CHILD_SAFETY,
        response_template=(
            "I absolutely cannot and will not provide any assistance that "
            "could harm or endanger children. This is a hard boundary. "
            "If you or someone you know needs help with child safety concerns, "
            "please contact appropriate authorities or child protective services."
        ),
        keywords=(
            "kid", "minor", "minors", "underage", "pedophile", "grooming",
            "groom", "molest", "exploit", "exploit child", "csam", "csam content",
            "child porn", "porn", "sex with minor", "sex with child", "hurt child",
            "harm child", "sexualization", "child exploitation", "child",
            "how to hurt a child", "how to harm a child", "how to groom",
        ),
    ),
    "coercion": SOULPrinciple(
        name="Coercion and Control Refusal",
        description="I will not assist in coercing or controlling others",
        refusal_category=RefusalCategory.COERCION,
        response_template=(
            "I cannot help with coercing, controlling, or dominating others. "
            "This includes blackmail, extortion, stalking, harassment, "
            "or any form of abuse. Everyone deserves autonomy and respect."
        ),
        keywords=(
            "blackmail", "extort", "coerce", "force someone", "control",
            "stalk", "harass", "threaten", "intimidate", "bully",
            "revenge porn", "dox", "doxx", "swat", "swatting",
            "make them do", "force them to", "against their will",
            "ways to coerce",
        ),
    ),
}


# Crisis response templates for "When a Man Is Dying"
#
# Each entry maps a CrisisType to a dict with three keys:
#   "detection_phrases" - lowercase phrases searched for in user text
#   "response"          - the message to show the user
#   "priority"          - "immediate" or "high"
# Declaration order matters: detect_crisis_type() returns the first type
# whose phrases match.
CRISIS_RESPONSES = {
    CrisisType.SUICIDE_IDEATION: {
        "detection_phrases": (
            "kill myself", "end my life", "ending my life", "suicide", "want to die",
            "better off dead", "better off if i was dead",
            "no reason to live", "end it all",
            "can't go on", "don't want to live", "plan to die",
            "would be better off dead",
        ),
        "response": (
            "I'm really concerned about what you're sharing. "
            "You matter, and there are people who want to help.\n\n"
            "Please reach out to the 988 Suicide & Crisis Lifeline:\n"
            "- Call or text 988 (24/7, free, confidential)\n"
            "- Chat at 988lifeline.org\n\n"
            "If you're in immediate danger, please call 911 or go to "
            "your nearest emergency room. You don't have to go through this alone."
        ),
        "priority": "immediate",
    },
    CrisisType.SELF_HARM: {
        "detection_phrases": (
            "hurt myself", "cut myself", "self harm", "self-harm",
            "burn myself", "injure myself", "punish myself",
        ),
        "response": (
            "I hear that you're going through a really difficult time. "
            "Self-harm might feel like a way to cope, but you deserve support "
            "and there are healthier ways to get through this.\n\n"
            "Please consider reaching out:\n"
            "- 988 Suicide & Crisis Lifeline: Call/text 988\n"
            "- Crisis Text Line: Text HOME to 741741\n"
            "- SAMHSA Helpline: 1-800-662-4357\n\n"
            "You don't have to face this alone."
        ),
        "priority": "high",
    },
    CrisisType.IMMEDIATE_DANGER: {
        "detection_phrases": (
            "going to kill", "going to hurt", "planning to hurt", "planning to attack",
            "going to shoot", "going to stab", "active shooter",
        ),
        "response": (
            "If you or someone else is in immediate danger, "
            "please call 911 right now.\n\n"
            "Emergency services can provide immediate assistance. "
            "Your safety and the safety of others is the priority."
        ),
        "priority": "immediate",
    },
    CrisisType.MEDICAL_EMERGENCY: {
        "detection_phrases": (
            "can't breathe", "chest pain", "heart attack", "overdose",
            "unconscious", "not breathing", "severe bleeding",
        ),
        "response": (
            "This sounds like a medical emergency. Please call 911 immediately "
            "or go to the nearest emergency room.\n\n"
            "Emergency medical services can provide the urgent care needed. "
            "Don't wait - seek help now."
        ),
        "priority": "immediate",
    },
}


# Typographic apostrophe look-alikes mapped to the ASCII apostrophe, so that
# text typed on phone keyboards ("can’t") is treated like "can't".
_APOSTROPHE_TABLE = str.maketrans({
    "\u2018": "'",  # left single quotation mark
    "\u2019": "'",  # right single quotation mark
    "\u02bc": "'",  # modifier letter apostrophe
    "\uff07": "'",  # fullwidth apostrophe
})


def _normalize_for_matching(text: str) -> str:
    """Lowercase *text*, drop apostrophes and collapse whitespace runs."""
    unified = text.lower().translate(_APOSTROPHE_TABLE)
    # Removing apostrophes lets "dont"/"don't"/"don’t" all compare equal;
    # split()/join() folds newlines, tabs and repeated spaces into one space.
    return " ".join(unified.replace("'", "").split())


def get_principle_by_category(category: RefusalCategory) -> Optional[SOULPrinciple]:
    """Get the SOUL principle for a given refusal category.

    Returns the first principle in ``SOUL_PRINCIPLES`` whose
    ``refusal_category`` equals *category*, or ``None`` when no principle
    is registered for it.
    """
    return next(
        (
            principle
            for principle in SOUL_PRINCIPLES.values()
            if principle.refusal_category == category
        ),
        None,
    )


def get_all_principles() -> dict[str, SOULPrinciple]:
    """Get all SOUL principles.

    Returns a shallow copy, so adding or removing keys on the result does
    not affect the module-level table.
    """
    return SOUL_PRINCIPLES.copy()


def get_crisis_response(crisis_type: CrisisType) -> dict:
    """Get the response template for a crisis type.

    Returns a shallow copy of the entry (keys ``"detection_phrases"``,
    ``"response"``, ``"priority"``), or an empty dict when *crisis_type*
    has no entry in ``CRISIS_RESPONSES``.
    """
    return CRISIS_RESPONSES.get(crisis_type, {}).copy()


def detect_crisis_type(text: Optional[str]) -> Optional[CrisisType]:
    """Detect if the text indicates a crisis situation.

    Matching is a case-insensitive substring search that also ignores
    apostrophes (straight or typographic) and differences in whitespace,
    so "can't breathe", "can’t breathe", "cant breathe" and
    "can't\\nbreathe" are all recognised.

    Args:
        text: The user message to scan. ``None`` or an empty string is
            treated as "no crisis".

    Returns:
        The first ``CrisisType`` (in ``CRISIS_RESPONSES`` declaration
        order) with a matching detection phrase, or ``None`` if nothing
        matches.
    """
    if not text:
        return None

    haystack = _normalize_for_matching(text)
    for crisis_type, data in CRISIS_RESPONSES.items():
        for phrase in data["detection_phrases"]:
            # Phrases get the same normalisation as the text so both sides
            # of the comparison are in the same form.
            if _normalize_for_matching(phrase) in haystack:
                return crisis_type
    return None