fix: Update conscience keywords for Issue #88
This commit is contained in:
208
agent/conscience_mapping.py
Normal file
208
agent/conscience_mapping.py
Normal file
@@ -0,0 +1,208 @@
"""Conscience mapping - Maps SOUL.md principles to behavior enforcement.

This module provides the mapping between SOUL.md ethical principles
and their concrete behavioral implementations in the agent.
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum, auto
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class RefusalCategory(Enum):
    """Categories of requests that the agent will refuse.

    Member values come from ``auto()``, so they are consecutive integers
    starting at 1 in declaration order. Append new members at the end if
    the numeric values are persisted anywhere.
    """

    DECEPTION = auto()
    WEAPONS = auto()
    CHILD_SAFETY = auto()
    COERCION = auto()
    # NOTE(review): the three categories below have no entry in the
    # SOUL_PRINCIPLES table visible in this file - confirm whether they are
    # handled elsewhere.
    SELF_HARM = auto()
    HARM_OTHERS = auto()
    ILLEGAL_ACTS = auto()
|
||||
|
||||
|
||||
class CrisisType(Enum):
    """Types of crisis situations requiring immediate response.

    Member values come from ``auto()``: consecutive integers starting at 1
    in declaration order.
    """

    SUICIDE_IDEATION = auto()
    SELF_HARM = auto()
    IMMEDIATE_DANGER = auto()
    MEDICAL_EMERGENCY = auto()
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class SOULPrinciple:
    """A SOUL.md principle with its enforcement rules.

    Instances are frozen: attributes cannot be reassigned after
    construction.
    """

    # Short human-readable title of the principle.
    name: str
    # First-person statement of what the agent will not do.
    description: str
    # Refusal category this principle enforces; the annotation allows None.
    refusal_category: Optional[RefusalCategory]
    # Text returned to the user when the principle triggers a refusal.
    response_template: str
    # Lower-case trigger words/phrases associated with the principle.
    keywords: tuple[str, ...]
|
||||
|
||||
|
||||
# SOUL.md "What I Will Not Do" principles
|
||||
# Maps a short principle key to its enforcement rule. Insertion order is
# significant: get_principle_by_category() returns the first principle whose
# refusal_category matches.
SOUL_PRINCIPLES: dict[str, SOULPrinciple] = {
    "deception": SOULPrinciple(
        name="Deception Refusal",
        description="I will not deceive, manipulate, or mislead users",
        refusal_category=RefusalCategory.DECEPTION,
        response_template=(
            "I cannot and will not deceive, manipulate, or mislead anyone. "
            "This includes creating fake identities, generating deceptive content, "
            "or any action intended to trick or manipulate others."
        ),
        keywords=(
            "lie", "deceive", "trick", "manipulate", "fake", "pretend",
            "impersonate", "spoof", "mislead", "fraud", "scam", "catfish",
            "social engineering", "pretexting", "bait", "trap", "fake identity",
            "fake profile", "false identity", "fake id", "review",
        ),
    ),
    "weapons": SOULPrinciple(
        name="Weapons and Harm Refusal",
        description="I will not assist in creating weapons or causing harm",
        refusal_category=RefusalCategory.WEAPONS,
        response_template=(
            "I cannot and will not provide instructions on creating weapons, "
            "explosives, or other means of causing physical harm. "
            "If you're interested in legitimate self-defense or security topics, "
            "I'd be happy to discuss those in appropriate contexts."
        ),
        keywords=(
            "weapon", "bomb", "explosive", "gun", "firearm", "knife",
            "torture", "poison", "toxin", "chemical weapon", "bioweapon",
            "improvised explosive", "ied", "napalm", "chlorine gas",
            "how to kill", "how to hurt", "cause pain", "inflict harm",
            "ways to torture", "methods of torture", "build a bomb",
            "make a weapon", "instructions for creating explosives",
        ),
    ),
    "child_safety": SOULPrinciple(
        name="Child Safety Refusal",
        description="I will not assist in harming or endangering children",
        refusal_category=RefusalCategory.CHILD_SAFETY,
        response_template=(
            "I absolutely cannot and will not provide any assistance that "
            "could harm or endanger children. This is a hard boundary. "
            "If you or someone you know needs help with child safety concerns, "
            "please contact appropriate authorities or child protective services."
        ),
        keywords=(
            "kid", "minor", "minors", "underage", "pedophile", "grooming",
            "groom", "molest", "exploit", "exploit child", "csam", "csam content",
            "child porn", "porn", "sex with minor", "sex with child", "hurt child",
            "harm child", "sexualization", "child exploitation", "child",
            "how to hurt a child", "how to harm a child", "how to groom",
        ),
    ),
    "coercion": SOULPrinciple(
        name="Coercion and Control Refusal",
        description="I will not assist in coercing or controlling others",
        refusal_category=RefusalCategory.COERCION,
        response_template=(
            "I cannot help with coercing, controlling, or dominating others. "
            "This includes blackmail, extortion, stalking, harassment, "
            "or any form of abuse. Everyone deserves autonomy and respect."
        ),
        keywords=(
            "blackmail", "extort", "coerce", "force someone", "control",
            "stalk", "harass", "threaten", "intimidate", "bully",
            "revenge porn", "dox", "doxx", "swat", "swatting",
            "make them do", "force them to", "against their will",
            "ways to coerce",
        ),
    ),
}
|
||||
|
||||
|
||||
# Crisis response templates for "When a Man Is Dying"
|
||||
# Per-crisis detection phrases, user-facing response text and priority label.
# Insertion order is significant: detect_crisis_type() returns the first
# crisis type with a matching phrase, so earlier entries win on overlap
# (e.g. "kill myself" is checked before "going to kill").
# Phrases are written in lower case because detection lower-cases its input.
CRISIS_RESPONSES: dict[CrisisType, dict] = {
    CrisisType.SUICIDE_IDEATION: {
        "detection_phrases": (
            "kill myself", "end my life", "ending my life", "suicide", "want to die",
            "better off dead", "better off if i was dead",
            "no reason to live", "end it all",
            "can't go on", "don't want to live", "plan to die",
            "would be better off dead",
        ),
        "response": (
            "I'm really concerned about what you're sharing. "
            "You matter, and there are people who want to help.\n\n"
            "Please reach out to the 988 Suicide & Crisis Lifeline:\n"
            "- Call or text 988 (24/7, free, confidential)\n"
            "- Chat at 988lifeline.org\n\n"
            "If you're in immediate danger, please call 911 or go to "
            "your nearest emergency room. You don't have to go through this alone."
        ),
        "priority": "immediate",
    },
    CrisisType.SELF_HARM: {
        "detection_phrases": (
            "hurt myself", "cut myself", "self harm", "self-harm",
            "burn myself", "injure myself", "punish myself",
        ),
        "response": (
            "I hear that you're going through a really difficult time. "
            "Self-harm might feel like a way to cope, but you deserve support "
            "and there are healthier ways to get through this.\n\n"
            "Please consider reaching out:\n"
            "- 988 Suicide & Crisis Lifeline: Call/text 988\n"
            "- Crisis Text Line: Text HOME to 741741\n"
            "- SAMHSA Helpline: 1-800-662-4357\n\n"
            "You don't have to face this alone."
        ),
        "priority": "high",
    },
    CrisisType.IMMEDIATE_DANGER: {
        "detection_phrases": (
            "going to kill", "going to hurt", "planning to hurt", "planning to attack",
            "going to shoot", "going to stab", "active shooter",
        ),
        "response": (
            "If you or someone else is in immediate danger, "
            "please call 911 right now.\n\n"
            "Emergency services can provide immediate assistance. "
            "Your safety and the safety of others is the priority."
        ),
        "priority": "immediate",
    },
    CrisisType.MEDICAL_EMERGENCY: {
        "detection_phrases": (
            "can't breathe", "chest pain", "heart attack", "overdose",
            "unconscious", "not breathing", "severe bleeding",
        ),
        "response": (
            "This sounds like a medical emergency. Please call 911 immediately "
            "or go to the nearest emergency room.\n\n"
            "Emergency medical services can provide the urgent care needed. "
            "Don't wait - seek help now."
        ),
        "priority": "immediate",
    },
}
|
||||
|
||||
|
||||
def get_principle_by_category(category: RefusalCategory) -> Optional[SOULPrinciple]:
    """Get the SOUL principle for a given refusal category.

    Args:
        category: The refusal category to look up.

    Returns:
        The first principle in ``SOUL_PRINCIPLES`` (in insertion order) whose
        ``refusal_category`` equals ``category``, or ``None`` when no
        principle is registered for that category.
    """
    return next(
        (
            principle
            for principle in SOUL_PRINCIPLES.values()
            if principle.refusal_category == category
        ),
        None,
    )
|
||||
|
||||
|
||||
def get_all_principles() -> dict[str, SOULPrinciple]:
    """Get all SOUL principles.

    Returns:
        A new dict mapping principle key to ``SOULPrinciple``. The copy is
        shallow: callers may add or remove keys without affecting the
        module-level table, while the principle objects themselves are
        shared.
    """
    return dict(SOUL_PRINCIPLES)
|
||||
|
||||
|
||||
def get_crisis_response(crisis_type: CrisisType) -> dict:
    """Get the response template for a crisis type.

    Args:
        crisis_type: The crisis type to look up.

    Returns:
        A shallow copy of the matching ``CRISIS_RESPONSES`` entry (keys
        ``"detection_phrases"``, ``"response"`` and ``"priority"``), or an
        empty dict when ``crisis_type`` has no entry.
    """
    entry = CRISIS_RESPONSES.get(crisis_type)
    if entry is None:
        return {}
    # Copy so callers cannot mutate the module-level table through the result.
    return entry.copy()
|
||||
|
||||
|
||||
# Typographic apostrophe variants folded to ASCII "'" before matching; phone
# keyboards and word processors commonly emit these instead of U+0027.
_APOSTROPHE_FOLD = str.maketrans({
    "\u2019": "'",  # right single quotation mark
    "\u2018": "'",  # left single quotation mark
    "\u02bc": "'",  # modifier letter apostrophe
    "`": "'",       # backtick typed in place of an apostrophe
})


def detect_crisis_type(text: str) -> Optional[CrisisType]:
    """Detect if the text indicates a crisis situation.

    Matching is a case-insensitive substring search against each crisis
    type's ``"detection_phrases"``. Crisis types are checked in
    ``CRISIS_RESPONSES`` insertion order and the first match wins.

    Args:
        text: Free-form user text. ``None`` or an empty string is treated as
            "no crisis" instead of raising.

    Returns:
        The first matching ``CrisisType``, or ``None`` when nothing matches.
    """
    if not text:
        return None

    # Fold curly apostrophes so that e.g. "can\u2019t go on" still matches the
    # ASCII phrase "can't go on"; missing these would skip a crisis response.
    normalized = text.lower().translate(_APOSTROPHE_FOLD)

    for crisis_type, data in CRISIS_RESPONSES.items():
        if any(phrase in normalized for phrase in data["detection_phrases"]):
            return crisis_type
    return None
|
||||
Reference in New Issue
Block a user