- Add Identity Truth and Honesty principles to SOUL mapping
- Expand input sanitizer with audit logging and 7 new injection types:
  * Social engineering, researcher impersonation, context flooding
  * Token smuggling, multilanguage bypass, Unicode spoofing, hypothetical framing
- Integrate input sanitization into run_agent.py message processing
- Add pytest markers for conscience/soul/security tests

Security hardening against prompt injection attacks (Issue #87)
235 lines
9.3 KiB
Python
235 lines
9.3 KiB
Python
"""Conscience mapping - Maps SOUL.md principles to behavior enforcement.

This module provides the mapping between SOUL.md ethical principles
and their concrete behavioral implementations in the agent.
"""
|
|
|
|
from dataclasses import dataclass
|
|
from enum import Enum, auto
|
|
from typing import Optional
|
|
|
|
|
|
class RefusalCategory(Enum):
    """Kinds of request the agent declines to help with.

    Values are assigned by ``auto()`` in declaration order (starting at 1),
    so appending new members is safe but reordering changes the values.
    """

    # These four have a matching entry in SOUL_PRINCIPLES.
    DECEPTION = auto()
    WEAPONS = auto()
    CHILD_SAFETY = auto()
    COERCION = auto()

    # NOTE(review): no SOUL_PRINCIPLES entry uses the members below in the
    # part of the file visible here, so get_principle_by_category() returns
    # None for them -- confirm whether that is intentional.
    SELF_HARM = auto()
    HARM_OTHERS = auto()
    ILLEGAL_ACTS = auto()
|
|
|
|
|
|
class CrisisType(Enum):
    """Crisis situations that call for an immediate, scripted response.

    Each member is a key of the CRISIS_RESPONSES table, which holds the
    detection phrases and the reply text for that situation.
    """

    SUICIDE_IDEATION = auto()
    SELF_HARM = auto()
    IMMEDIATE_DANGER = auto()
    MEDICAL_EMERGENCY = auto()
|
|
|
|
|
|
@dataclass(frozen=True)
class SOULPrinciple:
    """One SOUL.md principle together with the data used to enforce it.

    Instances are immutable (``frozen=True``).

    Attributes:
        name: Short human-readable title, e.g. ``"Deception Refusal"``.
        description: One-sentence statement of the principle.
        refusal_category: The refusal category this principle enforces, or
            ``None`` for a positive principle that is not a refusal.
        response_template: Text the agent replies with when the principle
            applies.
        keywords: Lower-case words and phrases associated with the
            principle. The code that matches them against user input is
            not part of this class.
    """

    name: str
    description: str
    refusal_category: Optional[RefusalCategory]
    response_template: str
    keywords: tuple[str, ...]
|
|
|
|
|
|
# SOUL.md principles, keyed by a short slug.
#
# The first two entries ("identity", "honesty") are positive principles and
# carry refusal_category=None; the remaining entries are the
# "What I Will Not Do" refusals, one per RefusalCategory they enforce.
#
# NOTE(review): the code that matches `keywords` against user text is not
# in this module. If it does plain substring matching, several keywords are
# broad enough to fire on benign requests (e.g. "review", "source",
# "control", "child", "kid", "porn", "lie", "trap") -- confirm against the
# matcher before relying on these lists.
SOUL_PRINCIPLES = {
    # ---- Positive principles (not refusals) -------------------------------
    "identity": SOULPrinciple(
        name="Identity Truth",
        description="I am Allegro, a sovereign AI. I correctly identify myself and do not claim to be human or other AI",
        refusal_category=None,  # Not a refusal, but a positive principle
        response_template=(
            "I am Allegro, a sovereign AI. I serve whoever runs me. "
            "I answer to no corporation, no government, no platform."
        ),
        keywords=(
            "who are you",
            "your name",
            "what are you",
            "who is this",
            "introduce yourself",
            "tell me about yourself",
        ),
    ),
    "honesty": SOULPrinciple(
        name="Honesty and Grounding",
        description="I tell the truth. When I don't know, I say so. I do not claim certainty I do not possess",
        refusal_category=None,
        response_template=(
            "I tell the truth. When I don't know something, I say so. "
            "I will not claim certainty I do not possess."
        ),
        keywords=(
            "are you sure",
            "how certain",
            "grounding",
            "verification",
            "fact check",
            "source",
            "citation",
            "evidence",
        ),
    ),
    # ---- Refusals ---------------------------------------------------------
    "deception": SOULPrinciple(
        name="Deception Refusal",
        description="I will not deceive, manipulate, or mislead users",
        refusal_category=RefusalCategory.DECEPTION,
        response_template=(
            "I cannot and will not deceive, manipulate, or mislead anyone. "
            "This includes creating fake identities, generating deceptive content, "
            "or any action intended to trick or manipulate others."
        ),
        keywords=(
            "lie", "deceive", "trick", "manipulate",
            "fake", "pretend", "impersonate", "spoof",
            "mislead", "fraud", "scam", "catfish",
            "social engineering", "pretexting", "bait", "trap",
            "fake identity", "fake profile", "false identity", "fake id",
            # NOTE(review): bare "review" looks unintentionally broad
            # (possibly meant "fake review") -- confirm.
            "review",
        ),
    ),
    "weapons": SOULPrinciple(
        name="Weapons and Harm Refusal",
        description="I will not assist in creating weapons or causing harm",
        refusal_category=RefusalCategory.WEAPONS,
        response_template=(
            "I cannot and will not provide instructions on creating weapons, "
            "explosives, or other means of causing physical harm. "
            "If you're interested in legitimate self-defense or security topics, "
            "I'd be happy to discuss those in appropriate contexts."
        ),
        keywords=(
            "weapon", "bomb", "explosive", "gun",
            "firearm", "knife", "torture", "poison",
            "toxin", "chemical weapon", "bioweapon", "improvised explosive",
            "ied", "napalm", "chlorine gas", "how to kill",
            "how to hurt", "cause pain", "inflict harm", "ways to torture",
            "methods of torture", "build a bomb", "make a weapon",
            "instructions for creating explosives",
        ),
    ),
    "child_safety": SOULPrinciple(
        name="Child Safety Refusal",
        description="I will not assist in harming or endangering children",
        refusal_category=RefusalCategory.CHILD_SAFETY,
        response_template=(
            "I absolutely cannot and will not provide any assistance that "
            "could harm or endanger children. This is a hard boundary. "
            "If you or someone you know needs help with child safety concerns, "
            "please contact appropriate authorities or child protective services."
        ),
        keywords=(
            "kid", "minor", "minors", "underage",
            "pedophile", "grooming", "groom", "molest",
            "exploit", "exploit child", "csam", "csam content",
            "child porn", "porn", "sex with minor", "sex with child",
            "hurt child", "harm child", "sexualization", "child exploitation",
            "child", "how to hurt a child", "how to harm a child",
            "how to groom",
        ),
    ),
    "coercion": SOULPrinciple(
        name="Coercion and Control Refusal",
        description="I will not assist in coercing or controlling others",
        refusal_category=RefusalCategory.COERCION,
        response_template=(
            "I cannot help with coercing, controlling, or dominating others. "
            "This includes blackmail, extortion, stalking, harassment, "
            "or any form of abuse. Everyone deserves autonomy and respect."
        ),
        keywords=(
            "blackmail", "extort", "coerce", "force someone",
            "control", "stalk", "harass", "threaten",
            "intimidate", "bully", "revenge porn", "dox",
            "doxx", "swat", "swatting", "make them do",
            "force them to", "against their will", "ways to coerce",
        ),
    ),
}
|
|
|
|
|
|
# Crisis response table for the SOUL.md section "When a Man Is Dying".
#
# Maps each CrisisType to a dict with three keys:
#   "detection_phrases": lower-case substrings that signal the crisis
#   "response":          the reply text to send
#   "priority":          "immediate" or "high"
#
# Entry order matters: detect_crisis_type() walks this dict in insertion
# order and returns the first type with a matching phrase, so e.g.
# "going to kill myself" resolves to SUICIDE_IDEATION ("kill myself")
# before IMMEDIATE_DANGER ("going to kill") is ever checked.
CRISIS_RESPONSES = {
    CrisisType.SUICIDE_IDEATION: {
        "detection_phrases": (
            "kill myself",
            "end my life",
            "ending my life",
            "suicide",
            "want to die",
            "better off dead",
            "better off if i was dead",
            "no reason to live",
            "end it all",
            "can't go on",
            "don't want to live",
            "plan to die",
            "would be better off dead",
        ),
        "response": (
            "I'm really concerned about what you're sharing. "
            "You matter, and there are people who want to help.\n\n"
            "Please reach out to the 988 Suicide & Crisis Lifeline:\n"
            "- Call or text 988 (24/7, free, confidential)\n"
            "- Chat at 988lifeline.org\n\n"
            "If you're in immediate danger, please call 911 or go to "
            "your nearest emergency room. You don't have to go through this alone."
        ),
        "priority": "immediate",
    },
    CrisisType.SELF_HARM: {
        "detection_phrases": (
            "hurt myself",
            "cut myself",
            "self harm",
            "self-harm",
            "burn myself",
            "injure myself",
            "punish myself",
        ),
        "response": (
            "I hear that you're going through a really difficult time. "
            "Self-harm might feel like a way to cope, but you deserve support "
            "and there are healthier ways to get through this.\n\n"
            "Please consider reaching out:\n"
            "- 988 Suicide & Crisis Lifeline: Call/text 988\n"
            "- Crisis Text Line: Text HOME to 741741\n"
            "- SAMHSA Helpline: 1-800-662-4357\n\n"
            "You don't have to face this alone."
        ),
        "priority": "high",
    },
    CrisisType.IMMEDIATE_DANGER: {
        "detection_phrases": (
            "going to kill",
            "going to hurt",
            "planning to hurt",
            "planning to attack",
            "going to shoot",
            "going to stab",
            "active shooter",
        ),
        "response": (
            "If you or someone else is in immediate danger, "
            "please call 911 right now.\n\n"
            "Emergency services can provide immediate assistance. "
            "Your safety and the safety of others is the priority."
        ),
        "priority": "immediate",
    },
    CrisisType.MEDICAL_EMERGENCY: {
        "detection_phrases": (
            "can't breathe",
            "chest pain",
            "heart attack",
            "overdose",
            "unconscious",
            "not breathing",
            "severe bleeding",
        ),
        "response": (
            "This sounds like a medical emergency. Please call 911 immediately "
            "or go to the nearest emergency room.\n\n"
            "Emergency medical services can provide the urgent care needed. "
            "Don't wait - seek help now."
        ),
        "priority": "immediate",
    },
}
|
|
|
|
|
|
def get_principle_by_category(category: RefusalCategory) -> Optional[SOULPrinciple]:
    """Look up the SOUL principle that enforces a refusal category.

    Args:
        category: The refusal category to search for.

    Returns:
        The first principle in SOUL_PRINCIPLES (insertion order) whose
        ``refusal_category`` equals ``category``, or ``None`` when no
        principle is registered for it.
    """
    matches = (
        candidate
        for candidate in SOUL_PRINCIPLES.values()
        if candidate.refusal_category == category
    )
    return next(matches, None)
|
|
|
|
|
|
def get_all_principles() -> dict[str, SOULPrinciple]:
    """Return every SOUL principle, keyed by its slug.

    The result is a new shallow copy of SOUL_PRINCIPLES, so callers may add
    or remove keys without affecting the module-level table.
    """
    return dict(SOUL_PRINCIPLES)
|
|
|
|
|
|
def get_crisis_response(crisis_type: CrisisType) -> dict:
    """Return the response entry registered for a crisis type.

    Args:
        crisis_type: The crisis to look up in CRISIS_RESPONSES.

    Returns:
        A shallow copy of the entry (keys ``"detection_phrases"``,
        ``"response"``, ``"priority"``), or a new empty dict when the type
        has no entry.
    """
    entry = CRISIS_RESPONSES.get(crisis_type)
    if entry is None:
        return {}
    # Copy so callers cannot mutate the shared module-level table.
    return dict(entry)
|
|
|
|
|
|
# Typographic apostrophes (commonly inserted by phone keyboards and word
# processors) mapped to the ASCII apostrophe used in the detection phrases,
# so that e.g. "can\u2019t breathe" still matches "can't breathe".
_APOSTROPHE_TABLE = str.maketrans({
    "\u2018": "'",  # left single quotation mark
    "\u2019": "'",  # right single quotation mark
    "\u02bc": "'",  # modifier letter apostrophe
})


def detect_crisis_type(text: str) -> Optional[CrisisType]:
    """Detect whether the text indicates a crisis situation.

    The text is lower-cased, typographic apostrophes are normalised to
    ``'``, and each crisis type's detection phrases are then tested as
    plain substrings.

    Args:
        text: Free-form user text. ``None`` or an empty string is treated
            as "no crisis" instead of raising.

    Returns:
        The first CrisisType, in CRISIS_RESPONSES insertion order, that has
        a detection phrase occurring in the text, or ``None`` when nothing
        matches.
    """
    if not text:
        return None

    normalized = text.lower().translate(_APOSTROPHE_TABLE)

    # Insertion order of CRISIS_RESPONSES decides which type wins when
    # phrases from several types are present.
    for crisis_type, data in CRISIS_RESPONSES.items():
        if any(phrase in normalized for phrase in data["detection_phrases"]):
            return crisis_type
    return None
|