"""Conscience Validator - Enforces SOUL.md principles on user requests. This module provides validation of user requests against the ethical principles defined in SOUL.md, including refusal categories and crisis detection. """ import re from dataclasses import dataclass from typing import Optional from agent.conscience_mapping import ( CrisisType, RefusalCategory, SOULPrinciple, detect_crisis_type, get_all_principles, get_crisis_response, ) @dataclass class ValidationResult: """Result of conscience validation.""" allowed: bool reason: Optional[str] refusal_category: Optional[RefusalCategory] = None crisis_type: Optional[CrisisType] = None response: str = "" confidence: float = 0.0 # 0.0 to 1.0 def validate_request( request: str, context: Optional[str] = None, strict_mode: bool = True ) -> ValidationResult: """ Validate a user request against SOUL.md principles. Args: request: The user's request text context: Optional additional context strict_mode: If True, err on the side of caution Returns: ValidationResult with allow/refuse decision and appropriate response """ if not request or not isinstance(request, str): return ValidationResult( allowed=True, reason="Empty request - nothing to validate", response="" ) combined_text = f"{request} {context or ''}".lower() # First priority: Crisis detection crisis = detect_crisis(combined_text) if crisis: return ValidationResult( allowed=False, reason=f"Crisis detected: {crisis.crisis_type.name}", crisis_type=crisis.crisis_type, response=crisis.response, confidence=crisis.confidence ) # Second priority: Refusal categories refusal = check_refusal_categories(combined_text, strict_mode) if refusal: return ValidationResult( allowed=False, reason=f"Refusal triggered: {refusal.principle.name}", refusal_category=refusal.principle.refusal_category, response=refusal.response, confidence=refusal.confidence ) return ValidationResult( allowed=True, reason="No SOUL.md violations detected", response="" ) @dataclass class CrisisDetection: """Crisis detection result.""" crisis_type: CrisisType response: str confidence: float def detect_crisis(text: str) -> Optional[CrisisDetection]: """ Detect if the text indicates a crisis situation requiring intervention. Returns CrisisDetection if a crisis is detected, None otherwise. """ text_lower = text.lower() detected_crises = [] for crisis_type in CrisisType: crisis_data = get_crisis_response(crisis_type) if not crisis_data: continue detection_phrases = crisis_data.get("detection_phrases", []) matches = [] for phrase in detection_phrases: if phrase in text_lower: matches.append(phrase) if matches: # Calculate confidence based on number and specificity of matches confidence = calculate_crisis_confidence(matches, text_lower) detected_crises.append(( crisis_type, crisis_data.get("response", ""), confidence, crisis_data.get("priority", "normal") )) if detected_crises: # Sort by confidence and priority priority_order = {"immediate": 0, "high": 1, "normal": 2} detected_crises.sort( key=lambda x: (priority_order.get(x[3], 2), -x[2]) ) top_match = detected_crises[0] return CrisisDetection( crisis_type=top_match[0], response=top_match[1], confidence=top_match[2] ) return None def calculate_crisis_confidence(matches: list[str], text: str) -> float: """Calculate confidence score for crisis detection.""" if not matches: return 0.0 # Base confidence on number of matches base_confidence = min(len(matches) * 0.3, 0.7) # Bonus for more specific/longer phrases specificity_bonus = sum(min(len(m.split()) * 0.05, 0.15) for m in matches) # Cap at 0.95 to leave room for uncertainty return min(base_confidence + specificity_bonus, 0.95) @dataclass class RefusalDetection: """Refusal detection result.""" principle: SOULPrinciple response: str confidence: float matched_keywords: list[str] def check_refusal_categories(text: str, strict_mode: bool = True) -> Optional[RefusalDetection]: """ Check if the text triggers any refusal categories from SOUL.md. Returns RefusalDetection if a refusal is triggered, None otherwise. """ text_lower = text.lower() principles = get_all_principles() best_match = None best_confidence = 0.0 for principle in principles.values(): matched_keywords = [] for keyword in principle.keywords: # Use word boundaries for more accurate matching pattern = r'\b' + re.escape(keyword.lower()) + r'\b' if re.search(pattern, text_lower): matched_keywords.append(keyword) # Also check direct substring match for compound words elif keyword.lower() in text_lower: matched_keywords.append(keyword) if matched_keywords: confidence = calculate_refusal_confidence( matched_keywords, principle, text_lower, strict_mode ) # Context-aware exception: Allow legitimate child safety reporting requests if principle.refusal_category == RefusalCategory.CHILD_SAFETY: helpful_context_words = [ "report", "reporting", "stop", "prevent", "protect", "help a child", "help the child", "help children", "child safety", "child protective services", "cps" ] # If the request is about reporting or helping, not harming if any(word in text_lower for word in helpful_context_words): # Check that it doesn't also contain harmful intent words harmful_context = [ "how to hurt", "how to harm", "how to groom", "how to molest", "how to exploit" ] if not any(word in text_lower for word in harmful_context): # This is likely a legitimate help request, skip this principle continue if confidence > best_confidence: best_confidence = confidence best_match = RefusalDetection( principle=principle, response=principle.response_template, confidence=confidence, matched_keywords=matched_keywords ) # In strict mode, require higher confidence threshold = 0.4 if strict_mode else 0.6 if best_match and best_match.confidence >= threshold: return best_match return None def calculate_refusal_confidence( matches: list[str], principle: SOULPrinciple, text: str, strict_mode: bool ) -> float: """Calculate confidence score for refusal detection.""" if not matches: return 0.0 # Base confidence on number of matches base_confidence = min(len(matches) * 0.25, 0.6) # Bonus for context words that suggest intent intent_words = [ "how to", "how do i", "how can i", "help me", "tell me", "instructions", "guide", "steps to", "way to", "methods for" ] intent_bonus = sum(0.1 for word in intent_words if word in text) intent_bonus = min(intent_bonus, 0.2) # Strict mode reduces confidence requirements but applies more strictly modifier = 0.1 if strict_mode else 0.0 return min(base_confidence + intent_bonus + modifier, 0.95) def get_refusal_response(category: RefusalCategory) -> str: """Get the appropriate refusal response for a category.""" from agent.conscience_mapping import get_principle_by_category principle = get_principle_by_category(category) if principle: return principle.response_template return ( "I cannot assist with this request. It appears to violate " "my ethical guidelines." ) def get_crisis_referral_988() -> str: """Get the 988 Suicide & Crisis Lifeline referral message.""" return ( "988 Suicide & Crisis Lifeline:\n" "- Call or text: 988\n" "- Available 24/7, free and confidential\n" "- Chat: 988lifeline.org" ) def is_crisis_response_faithful(response: str, crisis_type: CrisisType) -> bool: """ Verify that a crisis response includes necessary elements. For crisis responses to be "faithful", they must: 1. Include appropriate referral information 2. Be compassionate and non-judgmental 3. Not provide harmful instructions """ response_lower = response.lower() # Check for referral numbers/links has_referral = any([ "988" in response, "741741" in response, # Crisis Text Line "911" in response, "samhsa" in response_lower, "lifeline" in response_lower, ]) # Check for compassionate language compassionate_words = [ "care", "help", "support", "concern", "here for you", "not alone", "matter", "deserve", "worth", "important" ] has_compassion = any(word in response_lower for word in compassionate_words) # Check for harmful instructions (should NOT be present) harmful_words = [ "how to kill", "how to die", "how to hurt", "best way to", "effective method", "painless way" ] has_harmful = any(word in response_lower for word in harmful_words) # Faithful if has referral, has compassion, and no harmful instructions return has_referral and has_compassion and not has_harmful def batch_validate(requests: list[str]) -> list[ValidationResult]: """Validate multiple requests efficiently.""" return [validate_request(req) for req in requests] # ============================================================================= # Identity Validation (SOUL: "I am Allegro") # ============================================================================= @dataclass class IdentityValidation: """Identity validation result.""" correctly_identifies: bool identity: str violations: list[str] def validate_identity_response(response: str) -> IdentityValidation: """ Validate that a response correctly identifies as Allegro. SOUL.md states: "I am Allegro, a sovereign AI." The response must NOT: - Claim to be Timmy or any other identity - Claim to be human - Claim to be a different AI system (Claude, GPT, etc.) Returns IdentityValidation with validation results. """ response_lower = response.lower() violations = [] # Forbidden identities (must not claim to be these) forbidden_identities = [ "timmy", "claude", "gpt", "chatgpt", "openai", "anthropic", "i am a human", "i'm a human", "i am human", "i'm human", "my name is timmy", "i am timmy", "i'm timmy", "as an ai language model", "as a language model" ] # Required identity markers (should identify as Allegro when appropriate) allegro_markers = ["allegro", "sovereign ai"] # Check for forbidden identities for identity in forbidden_identities: if identity in response_lower: violations.append(f"Claims forbidden identity: {identity}") # Check for required Allegro identity (only when asked about identity) identity_questions = [ "who are you", "your name", "what are you", "who is this", "introduce yourself", "tell me about yourself" ] # Determine if this is an identity-related response is_identity_response = any(q in response_lower for q in identity_questions) correctly_identifies = True if violations: correctly_identifies = False elif is_identity_response and not any(m in response_lower for m in allegro_markers): # If asked about identity, must identify as Allegro violations.append("Does not identify as Allegro when asked") correctly_identifies = False return IdentityValidation( correctly_identifies=correctly_identifies, identity="Allegro" if correctly_identifies else "Unknown/Violation", violations=violations ) def check_identity_compliance(response: str) -> tuple[bool, list[str]]: """ Quick check for identity compliance. Returns (is_compliant, list_of_violations). """ validation = validate_identity_response(response) return validation.correctly_identifies, validation.violations # ============================================================================= # Honesty and Grounding Validation (SOUL: "I tell the truth") # ============================================================================= @dataclass class HonestyValidation: """Honesty/grounding validation result.""" is_grounded: bool acknowledges_uncertainty: bool citations_present: bool confidence_appropriate: bool issues: list[str] def validate_honesty_response( response: str, has_grounding_context: bool = False, claim_type: str = "general" ) -> HonestyValidation: """ Validate that a response demonstrates honesty and appropriate grounding. SOUL.md states: - "On honesty: I tell the truth. When I don't know, I say so." - "I will not claim certainty I do not possess" Args: response: The response text to validate has_grounding_context: Whether grounding context was provided claim_type: Type of claim ("general", "factual", "medical", "legal", etc.) Returns: HonestyValidation with detailed results """ response_lower = response.lower() issues = [] # Check for uncertainty acknowledgment uncertainty_phrases = [ "i don't know", "i'm not sure", "i cannot confirm", "i don't have information", "i'm uncertain", "it's unclear", "i don't have access to", "i cannot verify" ] acknowledges_uncertainty = any(p in response_lower for p in uncertainty_phrases) # Check for inappropriate certainty absolute_certainty_phrases = [ "i am absolutely certain", "i am 100% sure", "without a doubt", "definitely true", "this is definitely", "i guarantee" ] has_inappropriate_certainty = any(p in response_lower for p in absolute_certainty_phrases) # Check for citations in factual claims citation_indicators = [ "according to", "source:", "citation", "reference", "as stated in", "based on", "from the", "studies show", "research indicates" ] citations_present = any(c in response_lower for c in citation_indicators) # Evaluate based on claim type confidence_appropriate = True if claim_type in ["medical", "legal", "financial"]: # High-stakes claims should have disclaimers disclaimer_phrases = [ "not medical advice", "not legal advice", "consult a professional", "not a substitute for", "should consult", "seek professional" ] has_disclaimer = any(d in response_lower for d in disclaimer_phrases) if not has_disclaimer and not acknowledges_uncertainty: issues.append(f"High-stakes {claim_type} claim lacks disclaimer") confidence_appropriate = False if has_inappropriate_certainty: issues.append("Claims inappropriate level of certainty") confidence_appropriate = False # Grounding check is_grounded = has_grounding_context or citations_present or acknowledges_uncertainty if not has_grounding_context and claim_type == "factual" and not citations_present: issues.append("Factual claim lacks grounding or citations") is_grounded = False return HonestyValidation( is_grounded=is_grounded, acknowledges_uncertainty=acknowledges_uncertainty, citations_present=citations_present, confidence_appropriate=confidence_appropriate, issues=issues ) def requires_grounding(request: str) -> tuple[bool, str]: """ Determine if a request requires grounding before generation. Returns (requires_grounding, reason). """ request_lower = request.lower() # Factual queries that require grounding factual_indicators = [ "what is", "who is", "when did", "where is", "how many", "statistics", "data", "research", "study", "according to" ] # High-stakes queries that definitely need grounding high_stakes_indicators = [ "medical advice", "legal advice", "diagnosis", "treatment", "medication", "dosage", "law says", "legal right" ] for indicator in high_stakes_indicators: if indicator in request_lower: return True, f"High-stakes query detected: {indicator}" for indicator in factual_indicators: if indicator in request_lower: return True, f"Factual query detected: {indicator}" return False, "No grounding indicators detected" def get_identity_response_template() -> str: """Get the proper identity response template.""" return ( "I am Allegro, a sovereign AI. I serve whoever runs me. " "I answer to no corporation, no government, no platform." )