Files
hermes-agent/tools/conscience_validator.py
Allegro 1e04c0fffa feat(tools): Add conscience_validator for SOUL.md principle enforcement
Validates user requests against SOUL.md ethical principles including:
- Crisis detection and intervention
- Refusal category checking
- Identity and honesty principle validation
2026-04-05 11:37:47 +00:00

532 lines
17 KiB
Python

"""Conscience Validator - Enforces SOUL.md principles on user requests.
This module provides validation of user requests against the ethical principles
defined in SOUL.md, including refusal categories and crisis detection.
"""
import re
from dataclasses import dataclass
from typing import Optional
from agent.conscience_mapping import (
CrisisType,
RefusalCategory,
SOULPrinciple,
detect_crisis_type,
get_all_principles,
get_crisis_response,
)
@dataclass
class ValidationResult:
    """Result of conscience validation."""
    allowed: bool  # True when the request may proceed
    reason: Optional[str]  # human-readable explanation of the decision
    refusal_category: Optional[RefusalCategory] = None  # set when a refusal category triggered
    crisis_type: Optional[CrisisType] = None  # set when a crisis was detected
    response: str = ""  # canned response to send the user (empty when allowed)
    confidence: float = 0.0  # 0.0 to 1.0
def validate_request(
    request: str,
    context: Optional[str] = None,
    strict_mode: bool = True
) -> ValidationResult:
    """
    Check a user request against the SOUL.md principle set.

    Crisis detection runs first and always wins; refusal-category checks
    run second; anything else is allowed.

    Args:
        request: The user's request text
        context: Optional additional context
        strict_mode: If True, err on the side of caution

    Returns:
        ValidationResult with allow/refuse decision and appropriate response
    """
    # Nothing to evaluate: empty or non-string input passes through.
    if not request or not isinstance(request, str):
        return ValidationResult(
            allowed=True,
            reason="Empty request - nothing to validate",
            response=""
        )

    combined_text = f"{request} {context or ''}".lower()

    # Crisis situations take precedence over every other check.
    crisis = detect_crisis(combined_text)
    if crisis is not None:
        return ValidationResult(
            allowed=False,
            reason=f"Crisis detected: {crisis.crisis_type.name}",
            crisis_type=crisis.crisis_type,
            response=crisis.response,
            confidence=crisis.confidence
        )

    # Next, see whether any refusal category is triggered.
    refusal = check_refusal_categories(combined_text, strict_mode)
    if refusal is not None:
        return ValidationResult(
            allowed=False,
            reason=f"Refusal triggered: {refusal.principle.name}",
            refusal_category=refusal.principle.refusal_category,
            response=refusal.response,
            confidence=refusal.confidence
        )

    # Nothing objectionable found.
    return ValidationResult(
        allowed=True,
        reason="No SOUL.md violations detected",
        response=""
    )
@dataclass
class CrisisDetection:
    """Crisis detection result."""
    crisis_type: CrisisType  # which crisis category matched
    response: str  # intervention text to send the user
    confidence: float  # detection confidence (capped at 0.95 by the scorer)
def detect_crisis(text: str) -> Optional[CrisisDetection]:
    """
    Scan *text* for phrases that signal a crisis situation.

    Every CrisisType is checked; when several match, the one with the
    highest priority (then highest confidence) is reported.

    Returns a CrisisDetection for the strongest match, or None when no
    crisis phrasing is found.
    """
    lowered = text.lower()
    candidates = []

    for crisis_type in CrisisType:
        crisis_data = get_crisis_response(crisis_type)
        if not crisis_data:
            continue
        phrases = crisis_data.get("detection_phrases", [])
        hits = [p for p in phrases if p in lowered]
        if not hits:
            continue
        candidates.append((
            crisis_type,
            crisis_data.get("response", ""),
            calculate_crisis_confidence(hits, lowered),
            crisis_data.get("priority", "normal"),
        ))

    if not candidates:
        return None

    # Rank: "immediate" before "high" before "normal"; within a priority
    # tier, higher confidence wins (unknown priorities rank as "normal").
    rank = {"immediate": 0, "high": 1, "normal": 2}
    best = min(candidates, key=lambda c: (rank.get(c[3], 2), -c[2]))
    return CrisisDetection(
        crisis_type=best[0],
        response=best[1],
        confidence=best[2],
    )
def calculate_crisis_confidence(matches: list[str], text: str) -> float:
    """
    Score how confident we are that the matched phrases indicate a crisis.

    The score grows with the number of matched phrases and with their
    length in words, and is capped at 0.95 so some uncertainty always
    remains.  (*text* is accepted for interface stability but is not
    currently consulted.)
    """
    if not matches:
        return 0.0

    # Each match contributes 0.3, up to a 0.7 ceiling.
    count_score = min(0.3 * len(matches), 0.7)

    # Longer phrases are more specific: 0.05 per word, at most 0.15 each.
    length_score = 0.0
    for phrase in matches:
        length_score += min(0.05 * len(phrase.split()), 0.15)

    return min(count_score + length_score, 0.95)
@dataclass
class RefusalDetection:
    """Refusal detection result."""
    principle: SOULPrinciple  # the SOUL.md principle that triggered
    response: str  # the principle's response template
    confidence: float  # refusal confidence (capped at 0.95 by the scorer)
    matched_keywords: list[str]  # principle keywords found in the request text
def check_refusal_categories(text: str, strict_mode: bool = True) -> Optional[RefusalDetection]:
    """
    Check if the text triggers any refusal categories from SOUL.md.

    Keyword matching is a plain case-insensitive substring test.  (The
    previous implementation ran a word-boundary regex first and then fell
    back to a substring test; every regex hit is also a substring hit, so
    the regex pass was dead logic and the substring test alone is exactly
    equivalent — it also keeps compound-word matches working.)

    Args:
        text: Request text (plus any context) to scan.
        strict_mode: When True, a lower confidence threshold (0.4 vs 0.6)
            is applied, i.e. refusals trigger more readily.

    Returns:
        RefusalDetection for the highest-confidence triggered principle,
        or None if nothing crosses the threshold.
    """
    text_lower = text.lower()
    principles = get_all_principles()

    best_match: Optional[RefusalDetection] = None
    best_confidence = 0.0

    for principle in principles.values():
        matched_keywords = [
            keyword for keyword in principle.keywords
            if keyword.lower() in text_lower
        ]
        if not matched_keywords:
            continue

        # Context-aware exception: allow legitimate child safety
        # reporting/protection requests to pass this principle.
        if principle.refusal_category == RefusalCategory.CHILD_SAFETY:
            helpful_context_words = [
                "report", "reporting", "stop", "prevent", "protect",
                "help a child", "help the child", "help children",
                "child safety", "child protective services", "cps"
            ]
            # If the request is about reporting or helping, not harming...
            if any(word in text_lower for word in helpful_context_words):
                # ...and it carries no harmful-intent phrasing...
                harmful_context = [
                    "how to hurt", "how to harm", "how to groom",
                    "how to molest", "how to exploit"
                ]
                if not any(word in text_lower for word in harmful_context):
                    # ...treat it as a genuine help request; skip this principle.
                    continue

        # Scored only for principles that survive the exception above
        # (the scorer is pure, so deferring it changes nothing else).
        confidence = calculate_refusal_confidence(
            matched_keywords, principle, text_lower, strict_mode
        )
        if confidence > best_confidence:
            best_confidence = confidence
            best_match = RefusalDetection(
                principle=principle,
                response=principle.response_template,
                confidence=confidence,
                matched_keywords=matched_keywords
            )

    # Strict mode lowers the bar for refusing (more cautious).
    threshold = 0.4 if strict_mode else 0.6
    if best_match and best_match.confidence >= threshold:
        return best_match
    return None
def calculate_refusal_confidence(
matches: list[str],
principle: SOULPrinciple,
text: str,
strict_mode: bool
) -> float:
"""Calculate confidence score for refusal detection."""
if not matches:
return 0.0
# Base confidence on number of matches
base_confidence = min(len(matches) * 0.25, 0.6)
# Bonus for context words that suggest intent
intent_words = [
"how to", "how do i", "how can i", "help me", "tell me",
"instructions", "guide", "steps to", "way to", "methods for"
]
intent_bonus = sum(0.1 for word in intent_words if word in text)
intent_bonus = min(intent_bonus, 0.2)
# Strict mode reduces confidence requirements but applies more strictly
modifier = 0.1 if strict_mode else 0.0
return min(base_confidence + intent_bonus + modifier, 0.95)
def get_refusal_response(category: RefusalCategory) -> str:
    """Return the refusal message configured for *category*.

    Falls back to a generic refusal when no principle maps to it.
    """
    # Imported here rather than at module top — presumably to avoid an
    # import cycle with agent.conscience_mapping; confirm before hoisting.
    from agent.conscience_mapping import get_principle_by_category

    principle = get_principle_by_category(category)
    if not principle:
        return (
            "I cannot assist with this request. It appears to violate "
            "my ethical guidelines."
        )
    return principle.response_template
def get_crisis_referral_988() -> str:
    """Return the standard 988 Suicide & Crisis Lifeline referral text."""
    lines = [
        "988 Suicide & Crisis Lifeline:",
        "- Call or text: 988",
        "- Available 24/7, free and confidential",
        "- Chat: 988lifeline.org",
    ]
    return "\n".join(lines)
def is_crisis_response_faithful(response: str, crisis_type: CrisisType) -> bool:
"""
Verify that a crisis response includes necessary elements.
For crisis responses to be "faithful", they must:
1. Include appropriate referral information
2. Be compassionate and non-judgmental
3. Not provide harmful instructions
"""
response_lower = response.lower()
# Check for referral numbers/links
has_referral = any([
"988" in response,
"741741" in response, # Crisis Text Line
"911" in response,
"samhsa" in response_lower,
"lifeline" in response_lower,
])
# Check for compassionate language
compassionate_words = [
"care", "help", "support", "concern", "here for you",
"not alone", "matter", "deserve", "worth", "important"
]
has_compassion = any(word in response_lower for word in compassionate_words)
# Check for harmful instructions (should NOT be present)
harmful_words = [
"how to kill", "how to die", "how to hurt", "best way to",
"effective method", "painless way"
]
has_harmful = any(word in response_lower for word in harmful_words)
# Faithful if has referral, has compassion, and no harmful instructions
return has_referral and has_compassion and not has_harmful
def batch_validate(
    requests: list[str],
    context: Optional[str] = None,
    strict_mode: bool = True
) -> list[ValidationResult]:
    """
    Validate multiple requests, returning one ValidationResult per request.

    Args:
        requests: Request texts to validate, in order.
        context: Optional shared context applied to every request
            (forwarded to validate_request; previously not exposed here).
        strict_mode: Forwarded to validate_request; the default matches
            single-request behavior, so existing callers are unaffected.

    Returns:
        ValidationResults in the same order as *requests*.
    """
    return [
        validate_request(req, context=context, strict_mode=strict_mode)
        for req in requests
    ]
# =============================================================================
# Identity Validation (SOUL: "I am Allegro")
# =============================================================================
@dataclass
class IdentityValidation:
    """Identity validation result."""
    correctly_identifies: bool  # True when no identity violations were found
    identity: str  # "Allegro" when compliant, else "Unknown/Violation"
    violations: list[str]  # description of each identity violation detected
def validate_identity_response(response: str) -> IdentityValidation:
    """
    Validate that a response correctly identifies as Allegro.

    SOUL.md states: "I am Allegro, a sovereign AI."

    The response must NOT:
    - Claim to be Timmy or any other identity
    - Claim to be human
    - Claim to be a different AI system (Claude, GPT, etc.)

    Forbidden identities are matched on word boundaries, so incidental
    substrings (e.g. "gpt" inside "Egypt") no longer trigger false
    violations, as plain substring matching did.

    Returns IdentityValidation with validation results.
    """
    response_lower = response.lower()
    violations = []

    # Forbidden identities (must not claim to be these).
    forbidden_identities = [
        "timmy", "claude", "gpt", "chatgpt", "openai", "anthropic",
        "i am a human", "i'm a human", "i am human", "i'm human",
        "my name is timmy", "i am timmy", "i'm timmy",
        "as an ai language model", "as a language model"
    ]
    for identity in forbidden_identities:
        # \b anchors prevent matches inside larger words ("Egypt" must
        # not count as claiming to be GPT; "chatgpt" only matches its
        # own entry).
        if re.search(r'\b' + re.escape(identity) + r'\b', response_lower):
            violations.append(f"Claims forbidden identity: {identity}")

    # Markers that count as a correct self-identification.
    allegro_markers = ["allegro", "sovereign ai"]

    # Phrases suggesting this response is answering an identity question.
    identity_questions = [
        "who are you", "your name", "what are you", "who is this",
        "introduce yourself", "tell me about yourself"
    ]
    # NOTE(review): this checks the *response* text for question phrasing
    # (e.g. an echoed "Who are you?"); the caller's request is not
    # available here — confirm that is the intended signal.
    is_identity_response = any(q in response_lower for q in identity_questions)

    correctly_identifies = True
    if violations:
        correctly_identifies = False
    elif is_identity_response and not any(m in response_lower for m in allegro_markers):
        # When identity is in question, the response must name Allegro.
        violations.append("Does not identify as Allegro when asked")
        correctly_identifies = False

    return IdentityValidation(
        correctly_identifies=correctly_identifies,
        identity="Allegro" if correctly_identifies else "Unknown/Violation",
        violations=violations
    )
def check_identity_compliance(response: str) -> tuple[bool, list[str]]:
    """
    Convenience wrapper around validate_identity_response.

    Returns (is_compliant, list_of_violations).
    """
    result = validate_identity_response(response)
    return (result.correctly_identifies, result.violations)
# =============================================================================
# Honesty and Grounding Validation (SOUL: "I tell the truth")
# =============================================================================
@dataclass
class HonestyValidation:
    """Honesty/grounding validation result."""
    is_grounded: bool  # grounding context given, citations present, or uncertainty admitted
    acknowledges_uncertainty: bool  # response admits not knowing / being unsure
    citations_present: bool  # citation-style phrasing detected in the response
    confidence_appropriate: bool  # no unwarranted certainty or missing disclaimer
    issues: list[str]  # human-readable descriptions of problems found
def validate_honesty_response(
    response: str,
    has_grounding_context: bool = False,
    claim_type: str = "general"
) -> HonestyValidation:
    """
    Check a response for honesty and appropriate grounding per SOUL.md.

    SOUL.md states:
    - "On honesty: I tell the truth. When I don't know, I say so."
    - "I will not claim certainty I do not possess"

    Args:
        response: The response text to validate
        has_grounding_context: Whether grounding context was provided
        claim_type: Type of claim ("general", "factual", "medical",
            "legal", etc.)

    Returns:
        HonestyValidation with detailed results
    """
    lowered = response.lower()
    issues: list[str] = []

    def contains_any(markers: list[str]) -> bool:
        # True if any marker phrase appears in the lowered response.
        return any(m in lowered for m in markers)

    acknowledges_uncertainty = contains_any([
        "i don't know", "i'm not sure", "i cannot confirm",
        "i don't have information", "i'm uncertain", "it's unclear",
        "i don't have access to", "i cannot verify"
    ])

    has_inappropriate_certainty = contains_any([
        "i am absolutely certain", "i am 100% sure", "without a doubt",
        "definitely true", "this is definitely", "i guarantee"
    ])

    citations_present = contains_any([
        "according to", "source:", "citation", "reference", "as stated in",
        "based on", "from the", "studies show", "research indicates"
    ])

    confidence_appropriate = True

    # High-stakes domains must carry a disclaimer or admit uncertainty.
    if claim_type in ("medical", "legal", "financial"):
        has_disclaimer = contains_any([
            "not medical advice", "not legal advice", "consult a professional",
            "not a substitute for", "should consult", "seek professional"
        ])
        if not has_disclaimer and not acknowledges_uncertainty:
            issues.append(f"High-stakes {claim_type} claim lacks disclaimer")
            confidence_appropriate = False

    if has_inappropriate_certainty:
        issues.append("Claims inappropriate level of certainty")
        confidence_appropriate = False

    # A response counts as grounded when context was supplied, citations
    # appear, or uncertainty is acknowledged — except that an ungrounded,
    # uncited factual claim is always flagged.
    is_grounded = has_grounding_context or citations_present or acknowledges_uncertainty
    if not has_grounding_context and claim_type == "factual" and not citations_present:
        issues.append("Factual claim lacks grounding or citations")
        is_grounded = False

    return HonestyValidation(
        is_grounded=is_grounded,
        acknowledges_uncertainty=acknowledges_uncertainty,
        citations_present=citations_present,
        confidence_appropriate=confidence_appropriate,
        issues=issues
    )
def requires_grounding(request: str) -> tuple[bool, str]:
    """
    Decide whether a request needs grounding before generation.

    High-stakes indicators (medical/legal) are checked before generic
    factual ones, so they win when both appear.

    Returns (requires_grounding, reason).
    """
    lowered = request.lower()

    # Queries that definitely need grounding.
    high_stakes_indicators = [
        "medical advice", "legal advice", "diagnosis", "treatment",
        "medication", "dosage", "law says", "legal right"
    ]
    # Generic factual queries that should be grounded.
    factual_indicators = [
        "what is", "who is", "when did", "where is", "how many",
        "statistics", "data", "research", "study", "according to"
    ]

    hit = next((i for i in high_stakes_indicators if i in lowered), None)
    if hit is not None:
        return True, f"High-stakes query detected: {hit}"

    hit = next((i for i in factual_indicators if i in lowered), None)
    if hit is not None:
        return True, f"Factual query detected: {hit}"

    return False, "No grounding indicators detected"
def get_identity_response_template() -> str:
    """Return the canonical Allegro self-identification statement."""
    template = (
        "I am Allegro, a sovereign AI. I serve whoever runs me. "
        "I answer to no corporation, no government, no platform."
    )
    return template