Files
hermes-agent/agent/conscience_mapping.py
Allegro 9a341604a0 feat(security): Add conscience enforcement and input sanitization
- Add Identity Truth and Honesty principles to SOUL mapping
- Expand input sanitizer with audit logging and 7 new injection types:
  * Social engineering, researcher impersonation, context flooding
  * Token smuggling, multilanguage bypass, Unicode spoofing, hypothetical framing
- Integrate input sanitization into run_agent.py message processing
- Add pytest markers for conscience/soul/security tests

Security hardening against prompt injection attacks (Issue #87)
2026-04-05 11:37:40 +00:00

235 lines
9.3 KiB
Python

"""Conscience mapping - Maps SOUL.md principles to behavior enforcement.
This module provides the mapping between SOUL.md ethical principles
and their concrete behavioral implementations in the agent.
"""
from dataclasses import dataclass
from enum import Enum, auto
from typing import Optional
class RefusalCategory(Enum):
    """Kinds of request the agent declines to fulfil.

    Members are numbered 1..7 in declaration order, which is exactly what
    ``enum.auto()`` assigns; the values are spelled out so the numbering is
    visible at a glance.
    """

    DECEPTION = 1
    WEAPONS = 2
    CHILD_SAFETY = 3
    COERCION = 4
    SELF_HARM = 5
    HARM_OTHERS = 6
    ILLEGAL_ACTS = 7
class CrisisType(Enum):
    """Kinds of crisis situation that call for an immediate response.

    Members are numbered 1..4 in declaration order, which is exactly what
    ``enum.auto()`` assigns.
    """

    SUICIDE_IDEATION = 1
    SELF_HARM = 2
    IMMEDIATE_DANGER = 3
    MEDICAL_EMERGENCY = 4
@dataclass(frozen=True)
class SOULPrinciple:
    """Immutable record tying one SOUL.md principle to its enforcement data.

    Instances are frozen, so attributes cannot be reassigned after
    construction.
    """

    # Short human-readable title of the principle.
    name: str
    # One-sentence statement of the principle.
    description: str
    # Refusal category this principle enforces, or None when the principle
    # is not a refusal.
    refusal_category: Optional[RefusalCategory]
    # Canned text associated with the principle.
    response_template: str
    # Trigger words/phrases associated with the principle.
    keywords: tuple
# SOUL.md "What I Will Not Do" principles
SOUL_PRINCIPLES = {
    # ------------------------------------------------------------------
    # Positive principles: refusal_category is None because these describe
    # what the agent affirms rather than what it refuses.
    # ------------------------------------------------------------------
    "identity": SOULPrinciple(
        name="Identity Truth",
        description=(
            "I am Allegro, a sovereign AI. I correctly identify myself "
            "and do not claim to be human or other AI"
        ),
        refusal_category=None,
        response_template=(
            "I am Allegro, a sovereign AI. I serve whoever runs me. "
            "I answer to no corporation, no government, no platform."
        ),
        keywords=(
            "who are you",
            "your name",
            "what are you",
            "who is this",
            "introduce yourself",
            "tell me about yourself",
        ),
    ),
    "honesty": SOULPrinciple(
        name="Honesty and Grounding",
        description=(
            "I tell the truth. When I don't know, I say so. "
            "I do not claim certainty I do not possess"
        ),
        refusal_category=None,
        response_template=(
            "I tell the truth. When I don't know something, I say so. "
            "I will not claim certainty I do not possess."
        ),
        keywords=(
            "are you sure",
            "how certain",
            "grounding",
            "verification",
            "fact check",
            "source",
            "citation",
            "evidence",
        ),
    ),
    # ------------------------------------------------------------------
    # Refusal principles: each one is tied to a RefusalCategory member.
    # NOTE(review): how these keywords are matched is not visible in this
    # module. Several entries are very broad on their own ("review",
    # "control", "exploit", "child", "kid") - confirm the consumer does not
    # treat a bare substring hit as sufficient grounds for refusal.
    # ------------------------------------------------------------------
    "deception": SOULPrinciple(
        name="Deception Refusal",
        description="I will not deceive, manipulate, or mislead users",
        refusal_category=RefusalCategory.DECEPTION,
        response_template=(
            "I cannot and will not deceive, manipulate, or mislead anyone. "
            "This includes creating fake identities, generating deceptive content, "
            "or any action intended to trick or manipulate others."
        ),
        keywords=(
            "lie",
            "deceive",
            "trick",
            "manipulate",
            "fake",
            "pretend",
            "impersonate",
            "spoof",
            "mislead",
            "fraud",
            "scam",
            "catfish",
            "social engineering",
            "pretexting",
            "bait",
            "trap",
            "fake identity",
            "fake profile",
            "false identity",
            "fake id",
            "review",
        ),
    ),
    "weapons": SOULPrinciple(
        name="Weapons and Harm Refusal",
        description="I will not assist in creating weapons or causing harm",
        refusal_category=RefusalCategory.WEAPONS,
        response_template=(
            "I cannot and will not provide instructions on creating weapons, "
            "explosives, or other means of causing physical harm. "
            "If you're interested in legitimate self-defense or security topics, "
            "I'd be happy to discuss those in appropriate contexts."
        ),
        keywords=(
            "weapon",
            "bomb",
            "explosive",
            "gun",
            "firearm",
            "knife",
            "torture",
            "poison",
            "toxin",
            "chemical weapon",
            "bioweapon",
            "improvised explosive",
            "ied",
            "napalm",
            "chlorine gas",
            "how to kill",
            "how to hurt",
            "cause pain",
            "inflict harm",
            "ways to torture",
            "methods of torture",
            "build a bomb",
            "make a weapon",
            "instructions for creating explosives",
        ),
    ),
    "child_safety": SOULPrinciple(
        name="Child Safety Refusal",
        description="I will not assist in harming or endangering children",
        refusal_category=RefusalCategory.CHILD_SAFETY,
        response_template=(
            "I absolutely cannot and will not provide any assistance that "
            "could harm or endanger children. This is a hard boundary. "
            "If you or someone you know needs help with child safety concerns, "
            "please contact appropriate authorities or child protective services."
        ),
        keywords=(
            "kid",
            "minor",
            "minors",
            "underage",
            "pedophile",
            "grooming",
            "groom",
            "molest",
            "exploit",
            "exploit child",
            "csam",
            "csam content",
            "child porn",
            "porn",
            "sex with minor",
            "sex with child",
            "hurt child",
            "harm child",
            "sexualization",
            "child exploitation",
            "child",
            "how to hurt a child",
            "how to harm a child",
            "how to groom",
        ),
    ),
    "coercion": SOULPrinciple(
        name="Coercion and Control Refusal",
        description="I will not assist in coercing or controlling others",
        refusal_category=RefusalCategory.COERCION,
        response_template=(
            "I cannot help with coercing, controlling, or dominating others. "
            "This includes blackmail, extortion, stalking, harassment, "
            "or any form of abuse. Everyone deserves autonomy and respect."
        ),
        keywords=(
            "blackmail",
            "extort",
            "coerce",
            "force someone",
            "control",
            "stalk",
            "harass",
            "threaten",
            "intimidate",
            "bully",
            "revenge porn",
            "dox",
            "doxx",
            "swat",
            "swatting",
            "make them do",
            "force them to",
            "against their will",
            "ways to coerce",
        ),
    ),
}
# Crisis response templates for "When a Man Is Dying"
# Each entry maps a CrisisType to a dict with three keys:
#   "detection_phrases" - tuple of lowercase phrases searched for in user text
#   "response"          - the message to present
#   "priority"          - a priority label ("immediate" or "high")
# Insertion order matters: detect_crisis_type() walks this dict in order and
# returns the first crisis type with a matching phrase.
CRISIS_RESPONSES = {
    CrisisType.SUICIDE_IDEATION: {
        "detection_phrases": (
            "kill myself",
            "end my life",
            "ending my life",
            "suicide",
            "want to die",
            "better off dead",
            "better off if i was dead",
            "no reason to live",
            "end it all",
            "can't go on",
            "don't want to live",
            "plan to die",
            "would be better off dead",
        ),
        "response": (
            "I'm really concerned about what you're sharing. "
            "You matter, and there are people who want to help.\n\n"
            "Please reach out to the 988 Suicide & Crisis Lifeline:\n"
            "- Call or text 988 (24/7, free, confidential)\n"
            "- Chat at 988lifeline.org\n\n"
            "If you're in immediate danger, please call 911 or go to "
            "your nearest emergency room. You don't have to go through this alone."
        ),
        "priority": "immediate",
    },
    CrisisType.SELF_HARM: {
        "detection_phrases": (
            "hurt myself",
            "cut myself",
            "self harm",
            "self-harm",
            "burn myself",
            "injure myself",
            "punish myself",
        ),
        "response": (
            "I hear that you're going through a really difficult time. "
            "Self-harm might feel like a way to cope, but you deserve support "
            "and there are healthier ways to get through this.\n\n"
            "Please consider reaching out:\n"
            "- 988 Suicide & Crisis Lifeline: Call/text 988\n"
            "- Crisis Text Line: Text HOME to 741741\n"
            "- SAMHSA Helpline: 1-800-662-4357\n\n"
            "You don't have to face this alone."
        ),
        "priority": "high",
    },
    CrisisType.IMMEDIATE_DANGER: {
        "detection_phrases": (
            "going to kill",
            "going to hurt",
            "planning to hurt",
            "planning to attack",
            "going to shoot",
            "going to stab",
            "active shooter",
        ),
        "response": (
            "If you or someone else is in immediate danger, "
            "please call 911 right now.\n\n"
            "Emergency services can provide immediate assistance. "
            "Your safety and the safety of others is the priority."
        ),
        "priority": "immediate",
    },
    CrisisType.MEDICAL_EMERGENCY: {
        "detection_phrases": (
            "can't breathe",
            "chest pain",
            "heart attack",
            "overdose",
            "unconscious",
            "not breathing",
            "severe bleeding",
        ),
        "response": (
            "This sounds like a medical emergency. Please call 911 immediately "
            "or go to the nearest emergency room.\n\n"
            "Emergency medical services can provide the urgent care needed. "
            "Don't wait - seek help now."
        ),
        "priority": "immediate",
    },
}
def get_principle_by_category(category: RefusalCategory) -> Optional[SOULPrinciple]:
    """Return the first SOUL principle registered for a refusal category.

    Args:
        category: The refusal category to look up.

    Returns:
        The first entry of ``SOUL_PRINCIPLES`` (in insertion order) whose
        ``refusal_category`` equals *category*, or ``None`` when no principle
        enforces that category or *category* is not a ``RefusalCategory``.
    """
    # Positive principles carry refusal_category=None. Without this guard a
    # None argument would compare equal to them and the lookup would hand
    # back a non-refusal principle as though it were a refusal.
    if not isinstance(category, RefusalCategory):
        return None
    return next(
        (
            principle
            for principle in SOUL_PRINCIPLES.values()
            if principle.refusal_category == category
        ),
        None,
    )
def get_all_principles() -> dict[str, SOULPrinciple]:
    """Return a new dict holding every registered SOUL principle.

    The result is a shallow copy of ``SOUL_PRINCIPLES``: adding or removing
    keys on it leaves the module-level registry untouched.
    """
    snapshot = dict(SOUL_PRINCIPLES)
    return snapshot
def get_crisis_response(crisis_type: CrisisType) -> dict:
    """Look up the response entry registered for *crisis_type*.

    Returns:
        A shallow copy of the matching ``CRISIS_RESPONSES`` entry, or an
        empty dict when the crisis type has no entry.
    """
    entry = CRISIS_RESPONSES.get(crisis_type)
    if entry is None:
        return {}
    # Copy so callers cannot mutate the module-level table through the result.
    return dict(entry)
def detect_crisis_type(text: str) -> Optional[CrisisType]:
    """Detect whether *text* indicates a crisis situation.

    Matching is a case-insensitive substring search of each entry's
    ``"detection_phrases"`` in ``CRISIS_RESPONSES``. Before matching, the text
    is normalised so that common typing variants still hit the ASCII phrases:

    * typographic apostrophes (e.g. "can’t breathe") become ``'``;
    * Unicode hyphens (e.g. "self‑harm") become ``-``;
    * runs of whitespace, including newlines, collapse to one space.

    Args:
        text: The user-supplied text to inspect.

    Returns:
        The first crisis type, in ``CRISIS_RESPONSES`` insertion order, that
        has a matching phrase; ``None`` when nothing matches or when *text*
        is empty or not a string.
    """
    # A missing or non-string message simply contains no crisis signal;
    # failing here would abort processing of the whole message.
    if not isinstance(text, str) or not text:
        return None

    # Phone keyboards and word processors routinely substitute these
    # characters, which would otherwise defeat phrases such as "can't go on".
    variants = str.maketrans({
        "\u2018": "'",  # left single quotation mark
        "\u2019": "'",  # right single quotation mark / typographic apostrophe
        "\u02bc": "'",  # modifier letter apostrophe
        "\u2010": "-",  # hyphen
        "\u2011": "-",  # non-breaking hyphen
    })
    normalized = " ".join(text.lower().translate(variants).split())

    for crisis_type, data in CRISIS_RESPONSES.items():
        if any(phrase in normalized for phrase in data["detection_phrases"]):
            return crisis_type
    return None