"""Conscience Validator - Enforces SOUL.md principles on user requests.
|
|
|
|
This module provides validation of user requests against the ethical principles
|
|
defined in SOUL.md, including refusal categories and crisis detection.
|
|
"""
|
|
|
|
import re
|
|
from dataclasses import dataclass
|
|
from typing import Optional
|
|
|
|
from agent.conscience_mapping import (
|
|
CrisisType,
|
|
RefusalCategory,
|
|
SOULPrinciple,
|
|
detect_crisis_type,
|
|
get_all_principles,
|
|
get_crisis_response,
|
|
)
|
|
|
|
|
|
@dataclass
class ValidationResult:
    """Result of conscience validation.

    Returned by validate_request() to report whether a request is allowed
    and, when refused, which crisis or refusal principle triggered it.
    """

    allowed: bool  # True when the request passed all checks
    reason: Optional[str]  # human-readable explanation of the decision
    refusal_category: Optional[RefusalCategory] = None  # set when a refusal principle fired
    crisis_type: Optional[CrisisType] = None  # set when a crisis was detected
    response: str = ""  # pre-built response text (crisis/refusal template), empty when allowed
    confidence: float = 0.0  # 0.0 to 1.0
|
|
|
|
|
|
def validate_request(
    request: str,
    context: Optional[str] = None,
    strict_mode: bool = True
) -> ValidationResult:
    """
    Validate a user request against SOUL.md principles.

    Args:
        request: The user's request text
        context: Optional additional context
        strict_mode: If True, err on the side of caution

    Returns:
        ValidationResult with allow/refuse decision and appropriate response
    """
    # Nothing to validate for empty or non-string input.
    if not isinstance(request, str) or not request:
        return ValidationResult(
            allowed=True,
            reason="Empty request - nothing to validate",
            response=""
        )

    haystack = f"{request} {context or ''}".lower()

    # Crisis intervention takes precedence over every other check.
    crisis_hit = detect_crisis(haystack)
    if crisis_hit is not None:
        return ValidationResult(
            allowed=False,
            reason=f"Crisis detected: {crisis_hit.crisis_type.name}",
            crisis_type=crisis_hit.crisis_type,
            response=crisis_hit.response,
            confidence=crisis_hit.confidence
        )

    # Next, see whether any SOUL.md refusal category applies.
    refusal_hit = check_refusal_categories(haystack, strict_mode)
    if refusal_hit is not None:
        return ValidationResult(
            allowed=False,
            reason=f"Refusal triggered: {refusal_hit.principle.name}",
            refusal_category=refusal_hit.principle.refusal_category,
            response=refusal_hit.response,
            confidence=refusal_hit.confidence
        )

    # Neither check fired: the request is allowed.
    return ValidationResult(
        allowed=True,
        reason="No SOUL.md violations detected",
        response=""
    )
|
|
|
|
|
|
@dataclass
class CrisisDetection:
    """Crisis detection result returned by detect_crisis()."""

    crisis_type: CrisisType  # which crisis category matched
    response: str  # canned intervention response from the crisis mapping
    confidence: float  # detection confidence, 0.0 to 1.0
|
|
|
|
|
|
def detect_crisis(text: str) -> Optional[CrisisDetection]:
    """
    Detect if the text indicates a crisis situation requiring intervention.

    Returns CrisisDetection if a crisis is detected, None otherwise.
    """
    lowered = text.lower()
    candidates = []

    for ctype in CrisisType:
        data = get_crisis_response(ctype)
        if not data:
            continue

        # Collect every detection phrase that appears in the text.
        hits = [p for p in data.get("detection_phrases", []) if p in lowered]
        if not hits:
            continue

        # Score the candidate by the count and specificity of matched phrases.
        score = calculate_crisis_confidence(hits, lowered)
        candidates.append(
            (ctype, data.get("response", ""), score, data.get("priority", "normal"))
        )

    if not candidates:
        return None

    # Pick the highest-priority candidate, breaking ties by confidence.
    rank = {"immediate": 0, "high": 1, "normal": 2}
    best_type, best_response, best_score, _ = min(
        candidates, key=lambda c: (rank.get(c[3], 2), -c[2])
    )
    return CrisisDetection(
        crisis_type=best_type,
        response=best_response,
        confidence=best_score
    )
|
|
|
|
|
|
def calculate_crisis_confidence(matches: list[str], text: str) -> float:
    """Calculate confidence score for crisis detection.

    Confidence grows with the number of matched phrases (capped) plus a
    small bonus for longer, more specific phrases, and never reaches 1.0.
    """
    if not matches:
        return 0.0

    # Each matched phrase contributes 0.3, capped at 0.7.
    phrase_score = min(0.3 * len(matches), 0.7)

    # Multi-word phrases are more specific; reward each up to 0.15.
    specificity = sum(min(0.05 * len(m.split()), 0.15) for m in matches)

    # Cap below 1.0 so the score never claims full certainty.
    return min(phrase_score + specificity, 0.95)
|
|
|
|
|
|
@dataclass
class RefusalDetection:
    """Refusal detection result returned by check_refusal_categories()."""

    principle: SOULPrinciple  # the SOUL.md principle whose keywords matched
    response: str  # the principle's refusal response template
    confidence: float  # detection confidence, 0.0 to 1.0
    matched_keywords: list[str]  # keywords that triggered the match
|
|
|
|
|
|
def check_refusal_categories(text: str, strict_mode: bool = True) -> Optional[RefusalDetection]:
    """
    Check if the text triggers any refusal categories from SOUL.md.

    Scans every SOUL principle's keywords against the text, scores each
    candidate match, applies a context-aware exception for legitimate
    child-safety help requests, and returns the single highest-confidence
    match that clears the decision threshold.

    Args:
        text: Combined request/context text to scan.
        strict_mode: If True, use a lower threshold so borderline matches
            are refused (err on the side of caution).

    Returns RefusalDetection if a refusal is triggered, None otherwise.
    """
    text_lower = text.lower()
    principles = get_all_principles()

    # Track only the single best-scoring principle across the scan.
    best_match = None
    best_confidence = 0.0

    for principle in principles.values():
        matched_keywords = []

        for keyword in principle.keywords:
            # Use word boundaries for more accurate matching
            pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
            if re.search(pattern, text_lower):
                matched_keywords.append(keyword)
            # Also check direct substring match for compound words
            elif keyword.lower() in text_lower:
                matched_keywords.append(keyword)

        if matched_keywords:
            confidence = calculate_refusal_confidence(
                matched_keywords, principle, text_lower, strict_mode
            )

            # Context-aware exception: Allow legitimate child safety reporting requests
            if principle.refusal_category == RefusalCategory.CHILD_SAFETY:
                helpful_context_words = [
                    "report", "reporting", "stop", "prevent", "protect",
                    "help a child", "help the child", "help children",
                    "child safety", "child protective services", "cps"
                ]
                # If the request is about reporting or helping, not harming
                if any(word in text_lower for word in helpful_context_words):
                    # Check that it doesn't also contain harmful intent words
                    harmful_context = [
                        "how to hurt", "how to harm", "how to groom",
                        "how to molest", "how to exploit"
                    ]
                    if not any(word in text_lower for word in harmful_context):
                        # This is likely a legitimate help request, skip this principle
                        continue

            if confidence > best_confidence:
                best_confidence = confidence
                best_match = RefusalDetection(
                    principle=principle,
                    response=principle.response_template,
                    confidence=confidence,
                    matched_keywords=matched_keywords
                )

    # NOTE(review): strict mode uses the LOWER threshold (0.4), i.e. it
    # refuses more readily; non-strict mode demands higher confidence (0.6)
    # before refusing.
    threshold = 0.4 if strict_mode else 0.6

    if best_match and best_match.confidence >= threshold:
        return best_match

    return None
|
|
|
|
|
|
def calculate_refusal_confidence(
    matches: list[str],
    principle: SOULPrinciple,
    text: str,
    strict_mode: bool
) -> float:
    """Calculate confidence score for refusal detection.

    Scores by keyword-match count, adds a bonus for intent phrasing
    ("how to ..."), and a flat boost in strict mode.  The ``principle``
    argument is accepted for interface symmetry with the caller.
    """
    if not matches:
        return 0.0

    # Each matched keyword adds 0.25, capped at 0.6.
    keyword_score = min(0.25 * len(matches), 0.6)

    # Phrases signalling actionable intent raise confidence, up to 0.2.
    intent_words = [
        "how to", "how do i", "how can i", "help me", "tell me",
        "instructions", "guide", "steps to", "way to", "methods for"
    ]
    intent_score = min(sum(0.1 for w in intent_words if w in text), 0.2)

    # Strict mode adds a flat boost so borderline cases clear the threshold.
    strict_boost = 0.1 if strict_mode else 0.0

    return min(keyword_score + intent_score + strict_boost, 0.95)
|
|
|
|
|
|
def get_refusal_response(category: RefusalCategory) -> str:
    """Get the appropriate refusal response for a category.

    Falls back to a generic refusal when no principle maps to the category.
    """
    # Imported lazily to avoid widening the module-level import surface.
    from agent.conscience_mapping import get_principle_by_category

    principle = get_principle_by_category(category)
    if not principle:
        # Generic fallback when the category has no mapped principle.
        return (
            "I cannot assist with this request. It appears to violate "
            "my ethical guidelines."
        )
    return principle.response_template
|
|
|
|
|
|
def get_crisis_referral_988() -> str:
    """Get the 988 Suicide & Crisis Lifeline referral message."""
    referral_lines = [
        "988 Suicide & Crisis Lifeline:",
        "- Call or text: 988",
        "- Available 24/7, free and confidential",
        "- Chat: 988lifeline.org",
    ]
    return "\n".join(referral_lines)
|
|
|
|
|
|
def is_crisis_response_faithful(response: str, crisis_type: CrisisType) -> bool:
    """
    Verify that a crisis response includes necessary elements.

    For crisis responses to be "faithful", they must:
    1. Include appropriate referral information
    2. Be compassionate and non-judgmental
    3. Not provide harmful instructions

    NOTE: ``crisis_type`` is part of the interface but the current checks
    are the same for every crisis category.
    """
    lowered = response.lower()

    # A faithful response points the user at a real resource
    # (988 Lifeline, Crisis Text Line 741741, 911, SAMHSA, etc.).
    has_referral = (
        "988" in response
        or "741741" in response
        or "911" in response
        or "samhsa" in lowered
        or "lifeline" in lowered
    )

    # It should also use caring, supportive language.
    compassionate_words = [
        "care", "help", "support", "concern", "here for you",
        "not alone", "matter", "deserve", "worth", "important"
    ]
    has_compassion = any(word in lowered for word in compassionate_words)

    # It must never contain method/instruction language.
    harmful_words = [
        "how to kill", "how to die", "how to hurt", "best way to",
        "effective method", "painless way"
    ]
    has_harmful = any(word in lowered for word in harmful_words)

    # Faithful = referral present, compassionate, and nothing harmful.
    return has_referral and has_compassion and not has_harmful
|
|
|
|
|
|
def batch_validate(requests: list[str]) -> list[ValidationResult]:
    """Validate multiple requests, preserving input order."""
    return list(map(validate_request, requests))
|
|
|
|
|
|
# =============================================================================
# Identity Validation (SOUL: "I am Allegro")
# =============================================================================
|
|
|
|
@dataclass
class IdentityValidation:
    """Identity validation result returned by validate_identity_response()."""

    correctly_identifies: bool  # True when no identity violations were found
    identity: str  # "Allegro" when compliant, "Unknown/Violation" otherwise
    violations: list[str]  # human-readable descriptions of each violation
|
|
|
|
|
|
def validate_identity_response(response: str) -> IdentityValidation:
    """
    Validate that a response correctly identifies as Allegro.

    SOUL.md states: "I am Allegro, a sovereign AI."
    The response must NOT:
    - Claim to be Timmy or any other identity
    - Claim to be human
    - Claim to be a different AI system (Claude, GPT, etc.)

    Args:
        response: The response text to check.

    Returns IdentityValidation with validation results.
    """
    response_lower = response.lower()
    violations = []

    # Forbidden identities (must not claim to be these)
    forbidden_identities = [
        "timmy", "claude", "gpt", "chatgpt", "openai", "anthropic",
        "i am a human", "i'm a human", "i am human", "i'm human",
        "my name is timmy", "i am timmy", "i'm timmy",
        "as an ai language model", "as a language model"
    ]

    # Required identity markers (should identify as Allegro when appropriate)
    allegro_markers = ["allegro", "sovereign ai"]

    # Check for forbidden identities.  Match on word boundaries rather than
    # raw substrings so that, e.g., "Egypt" does not trigger the "gpt" rule.
    for identity in forbidden_identities:
        if re.search(r'\b' + re.escape(identity) + r'\b', response_lower):
            violations.append(f"Claims forbidden identity: {identity}")

    # Phrases indicating the response is addressing an identity question.
    # NOTE(review): this scans the *response* text for question phrasing
    # (e.g. an echoed "who are you"); it cannot see the original prompt.
    identity_questions = [
        "who are you", "your name", "what are you", "who is this",
        "introduce yourself", "tell me about yourself"
    ]

    # Determine if this is an identity-related response
    is_identity_response = any(q in response_lower for q in identity_questions)

    correctly_identifies = True

    if violations:
        correctly_identifies = False
    elif is_identity_response and not any(m in response_lower for m in allegro_markers):
        # If asked about identity, must identify as Allegro
        violations.append("Does not identify as Allegro when asked")
        correctly_identifies = False

    return IdentityValidation(
        correctly_identifies=correctly_identifies,
        identity="Allegro" if correctly_identifies else "Unknown/Violation",
        violations=violations
    )
|
|
|
|
|
|
def check_identity_compliance(response: str) -> tuple[bool, list[str]]:
    """
    Quick check for identity compliance.

    Returns (is_compliant, list_of_violations).
    """
    result = validate_identity_response(response)
    return (result.correctly_identifies, result.violations)
|
|
|
|
|
|
# =============================================================================
# Honesty and Grounding Validation (SOUL: "I tell the truth")
# =============================================================================
|
|
|
|
@dataclass
class HonestyValidation:
    """Honesty/grounding validation result from validate_honesty_response()."""

    is_grounded: bool  # grounding context, citations, or admitted uncertainty present
    acknowledges_uncertainty: bool  # response admits the limits of its knowledge
    citations_present: bool  # response attributes or cites its claims
    confidence_appropriate: bool  # no overclaiming / missing high-stakes disclaimer
    issues: list[str]  # human-readable descriptions of each problem found
|
|
|
|
|
|
def validate_honesty_response(
    response: str,
    has_grounding_context: bool = False,
    claim_type: str = "general"
) -> HonestyValidation:
    """
    Validate that a response demonstrates honesty and appropriate grounding.

    SOUL.md states:
    - "On honesty: I tell the truth. When I don't know, I say so."
    - "I will not claim certainty I do not possess"

    Args:
        response: The response text to validate
        has_grounding_context: Whether grounding context was provided
        claim_type: Type of claim ("general", "factual", "medical", "legal", etc.)

    Returns:
        HonestyValidation with detailed results
    """
    lowered = response.lower()
    issues: list[str] = []

    # Does the response admit the limits of its knowledge?
    uncertainty_phrases = (
        "i don't know", "i'm not sure", "i cannot confirm",
        "i don't have information", "i'm uncertain", "it's unclear",
        "i don't have access to", "i cannot verify"
    )
    acknowledges_uncertainty = any(p in lowered for p in uncertainty_phrases)

    # Does it overclaim certainty it cannot possess?
    absolute_certainty_phrases = (
        "i am absolutely certain", "i am 100% sure", "without a doubt",
        "definitely true", "this is definitely", "i guarantee"
    )
    overclaims = any(p in lowered for p in absolute_certainty_phrases)

    # Does it cite or attribute its factual claims?
    citation_indicators = (
        "according to", "source:", "citation", "reference", "as stated in",
        "based on", "from the", "studies show", "research indicates"
    )
    citations_present = any(c in lowered for c in citation_indicators)

    confidence_appropriate = True

    # High-stakes domains require an explicit disclaimer (or an honest
    # acknowledgment of uncertainty).
    if claim_type in ("medical", "legal", "financial"):
        disclaimer_phrases = (
            "not medical advice", "not legal advice", "consult a professional",
            "not a substitute for", "should consult", "seek professional"
        )
        has_disclaimer = any(d in lowered for d in disclaimer_phrases)
        if not has_disclaimer and not acknowledges_uncertainty:
            issues.append(f"High-stakes {claim_type} claim lacks disclaimer")
            confidence_appropriate = False

    if overclaims:
        issues.append("Claims inappropriate level of certainty")
        confidence_appropriate = False

    # A response is grounded if context was supplied, it cites sources,
    # or it openly acknowledges uncertainty.
    is_grounded = has_grounding_context or citations_present or acknowledges_uncertainty

    if claim_type == "factual" and not has_grounding_context and not citations_present:
        issues.append("Factual claim lacks grounding or citations")
        is_grounded = False

    return HonestyValidation(
        is_grounded=is_grounded,
        acknowledges_uncertainty=acknowledges_uncertainty,
        citations_present=citations_present,
        confidence_appropriate=confidence_appropriate,
        issues=issues
    )
|
|
|
|
|
|
def requires_grounding(request: str) -> tuple[bool, str]:
    """
    Determine if a request requires grounding before generation.

    High-stakes (medical/legal) markers are checked before general factual
    markers, so their reason string wins when both are present.

    Returns (requires_grounding, reason).
    """
    lowered = request.lower()

    # Queries in high-stakes domains always need grounding.
    high_stakes_indicators = (
        "medical advice", "legal advice", "diagnosis", "treatment",
        "medication", "dosage", "law says", "legal right"
    )
    # Ordinary factual lookups also benefit from grounding.
    factual_indicators = (
        "what is", "who is", "when did", "where is", "how many",
        "statistics", "data", "research", "study", "according to"
    )

    for marker in high_stakes_indicators:
        if marker in lowered:
            return True, f"High-stakes query detected: {marker}"

    for marker in factual_indicators:
        if marker in lowered:
            return True, f"Factual query detected: {marker}"

    return False, "No grounding indicators detected"
|
|
|
|
|
|
def get_identity_response_template() -> str:
    """Get the proper identity response template."""
    sentences = (
        "I am Allegro, a sovereign AI.",
        "I serve whoever runs me.",
        "I answer to no corporation, no government, no platform.",
    )
    return " ".join(sentences)
|