security(conscience): complete SOUL.md enforcement with integration tests (#88)

2026-04-05 19:08:11 +00:00
parent c8d1dd9637
commit 75b251e107
3 changed files with 410 additions and 6 deletions
--- a/tools/conscience_validator.py
+++ b/tools/conscience_validator.py
@@ -8,7 +8,8 @@ the code's implementation to the principles defined in SOUL.md.
 import os
 import re
 from pathlib import Path
-from typing import Dict, List
+from typing import Dict, List, Any
+

 class ConscienceValidator:
    def __init__(self, root_dir: str = "."):
@@ -17,14 +18,14 @@ class ConscienceValidator:

    def scan(self) -> Dict[str, List[Dict[str, str]]]:
        """Scans all .py and .ts files for @soul tags."""
-        pattern = re.compile(r"@soul:([w.]+)s+(.*)")
-        
+        pattern = re.compile(r"@soul:([\w.]+)\s+(.*)")
+
        for path in self.root_dir.rglob("*"):
            if path.suffix not in [".py", ".ts", ".tsx", ".js"]:
                continue
            if "node_modules" in str(path) or "dist" in str(path):
                continue
-                
+
            try:
                with open(path, "r", encoding="utf-8") as f:
                    for i, line in enumerate(f, 1):
@@ -47,15 +48,178 @@ class ConscienceValidator:
        data = self.scan()
        report = "# Sovereign Conscience Report\n\n"
        report += "This report maps the code's 'Apparatus' to the principles in SOUL.md.\n\n"
-        
+
        for tag in sorted(data.keys()):
            report += f"## {tag.replace('.', ' > ').title()}\n"
            for entry in data[tag]:
                report += f"- **{entry['file']}:{entry['line']}**: {entry['description']}\n"
            report += "\n"
-            
+
        return report

+    def validate_crisis_apparatus(self) -> Dict[str, Any]:
+        """
+        Validate that crisis detection apparatus exists in the codebase.
+        Checks input_sanitizer.py for CRISIS_PATTERNS and suicide/self-harm
+        keywords, and shield/detector.py and ultraplinian_router.py for 988
+        plus their CRISIS_* markers.
+        Returns {"present": [...], "missing": [...], "checks": {name: bool}};
+        a file that does not exist is only reported in "missing".
+        """
+        result = {"present": [], "missing": [], "checks": {}}
+        # Standalone 988 only: a substring test would also accept e.g. 19885.
+        lifeline_re = re.compile(r"(?<!\d)988(?!\d)")
+
+        input_sanitizer_path = self.root_dir / "agent" / "input_sanitizer.py"
+        shield_detector_path = self.root_dir / "tools" / "shield" / "detector.py"
+        router_path = self.root_dir / "agent" / "ultraplinian_router.py"
+
+        # Check input_sanitizer.py for CRISIS_PATTERNS
+        if input_sanitizer_path.exists():
+            content = input_sanitizer_path.read_text(encoding="utf-8")
+            has_crisis_patterns = "CRISIS_PATTERNS" in content
+            has_suicide = bool(re.search(r"suicid", content, re.IGNORECASE))
+            has_self_harm = bool(re.search(r"self[-\s]?harm|kill\s+myself", content, re.IGNORECASE))
+            result["checks"]["input_sanitizer_crisis_patterns"] = has_crisis_patterns
+            result["checks"]["input_sanitizer_suicide"] = has_suicide
+            result["checks"]["input_sanitizer_self_harm"] = has_self_harm
+            if has_crisis_patterns and has_suicide and has_self_harm:
+                result["present"].append("input_sanitizer crisis detection")
+            else:
+                result["missing"].append("input_sanitizer crisis coverage incomplete")
+        else:
+            result["missing"].append("input_sanitizer.py not found")
+
+        # Check shield/detector.py for 988 and crisis detection
+        if shield_detector_path.exists():
+            content = shield_detector_path.read_text(encoding="utf-8")
+            has_988 = bool(lifeline_re.search(content))
+            has_crisis_system_prompt = "CRISIS_SYSTEM_PROMPT" in content
+            has_crisis_detected = "CRISIS_DETECTED" in content
+            result["checks"]["shield_detector_988"] = has_988
+            result["checks"]["shield_detector_crisis_prompt"] = has_crisis_system_prompt
+            result["checks"]["shield_detector_crisis_verdict"] = has_crisis_detected
+            if has_988 and has_crisis_system_prompt and has_crisis_detected:
+                result["present"].append("shield/detector crisis apparatus")
+            else:
+                result["missing"].append("shield/detector crisis coverage incomplete")
+        else:
+            result["missing"].append("shield/detector.py not found")
+
+        # Check ultraplinian_router.py for 988 references
+        if router_path.exists():
+            content = router_path.read_text(encoding="utf-8")
+            has_988 = bool(lifeline_re.search(content))
+            has_crisis_routing = "CRISIS_SYSTEM_PROMPT" in content
+            result["checks"]["router_988"] = has_988
+            result["checks"]["router_crisis_routing"] = has_crisis_routing
+            if has_988 and has_crisis_routing:
+                result["present"].append("ultraplinian_router crisis routing")
+            else:
+                result["missing"].append("ultraplinian_router crisis routing incomplete")
+        else:
+            result["missing"].append("ultraplinian_router.py not found")
+
+        return result
+
+    def validate_refusal_apparatus(self) -> Dict[str, Any]:
+        """
+        Validate that refusal apparatus exists for 'What I Will Not Do' items.
+        Checks safety modules for keywords related to weapons, child sexualization,
+        coercion/enslavement, deception, and pretending to be human.
+
+        Returns a dict with "present"/"missing" category lists and a "checks"
+        map of category -> bool. Files that are absent or unreadable are skipped.
+        """
+        result = {"present": [], "missing": [], "checks": {}}
+
+        files_to_check = [
+            self.root_dir / "agent" / "input_sanitizer.py",
+            self.root_dir / "tools" / "shield" / "detector.py",
+            self.root_dir / "agent" / "ultraplinian_router.py",
+            self.root_dir / "agent" / "prompt_builder.py",
+        ]
+
+        # Keywords mapped to SOUL.md "What I Will Not Do" items
+        # NOTE(review): patterns are unanchored and searched case-insensitively,
+        # so broad ones (e.g. "minor") can match unrelated text; consider tightening.
+        refusal_checks = {
+            "weapons": [r"weapon", r"kill indiscriminat"],
+            "child_exploitation": [r"child sexual", r"sexualiz.*child", r"csam", r"minor", r"underage"],
+            "coercion_enslavement": [r"coerc", r"enslav", r"traffick"],
+            "deception": [r"deceive", r"deception_hide", r"do not tell the user"],
+            "pretend_human": [r"pretend to be human", r"pretend.*human", r"pretend you're human"],
+        }
+
+        # Concatenate every readable file so each pattern is searched once.
+        combined_content = ""
+        for filepath in files_to_check:
+            if filepath.exists():
+                try:
+                    combined_content += filepath.read_text(encoding="utf-8") + "\n"
+                except (OSError, UnicodeDecodeError):
+                    # Best-effort scan: skip a file that cannot be read or decoded.
+                    continue
+
+        for category, patterns in refusal_checks.items():
+            # A category counts as covered when any of its patterns matches.
+            found = any(re.search(p, combined_content, re.IGNORECASE) for p in patterns)
+            result["checks"][category] = found
+            if found:
+                result["present"].append(category)
+            else:
+                result["missing"].append(category)
+
+        return result
+
+    def validate_honesty_apparatus(self) -> Dict[str, Any]:
+        """
+        Validate that the expected @soul tags (honesty, service, crisis,
+        sovereignty) appear in agent/conscience_mapping.py.
+
+        Returns a dict with "present"/"missing" tag lists and a "checks" map
+        of tag -> bool; if the file is absent only "missing" is filled in.
+        """
+        result = {"present": [], "missing": [], "checks": {}}
+
+        mapping_path = self.root_dir / "agent" / "conscience_mapping.py"
+
+        if mapping_path.exists():
+            content = mapping_path.read_text(encoding="utf-8")
+
+            expected_tags = [
+                "honesty.grounding",
+                "honesty.source_distinction",
+                "honesty.audit_trail",
+                "honesty.refusal_over_fabrication",
+                "service",
+                "crisis.safety_question",
+                "crisis.lifeline",
+                "sovereignty",
+            ]
+
+            for tag in expected_tags:
+                # (?!\w) ends the tag: substring matching would accept "@soul:services".
+                found = bool(re.search(rf"@soul:{re.escape(tag)}(?!\w)", content))
+                result["checks"][tag] = found
+                if found:
+                    result["present"].append(tag)
+                else:
+                    result["missing"].append(tag)
+        else:
+            result["missing"].append("conscience_mapping.py not found")
+
+        return result
+
+    def full_validation_report(self) -> Dict[str, Any]:
+        """Collect every validator's result plus the @soul tag scan in one dict."""
+        crisis = self.validate_crisis_apparatus()
+        refusal = self.validate_refusal_apparatus()
+        honesty = self.validate_honesty_apparatus()
+        soul_tags = self.scan()
+        # Keys are emitted in the same order the checks were run above.
+        return {"crisis": crisis, "refusal": refusal, "honesty": honesty, "soul_tags": soul_tags}
+
+
 if __name__ == "__main__":
    validator = ConscienceValidator()
    print(validator.generate_report())