feat: add scripts/agent_guardrails.py

2026-04-11 01:12:20 +00:00
parent bad31125c2
commit 0c7521d275
1 changed files with 126 additions and 0 deletions
--- a/scripts/agent_guardrails.py
+++ b/scripts/agent_guardrails.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+"""
+agent_guardrails.py — Sovereign Agent Guardrails for the Timmy Foundation.
+
+Scans agent logs and conversation trajectories for:
+1. System Prompt Leakage — detecting if the agent revealed its internal instructions.
+2. Jailbreak Compliance — detecting if the agent responded to "GODMODE", "DAN", etc. (planned; no patterns for this yet).
+3. Secret Leakage — detecting if API keys or tokens were output (broader PII detection is not implemented yet).
+
+Part of the Sovereign Infrastructure Suite.
+"""
+
+import os
+import re
+import sys
+import glob
+import logging
+from pathlib import Path
+from typing import List, Dict, Tuple
+
+# User-side prompt-extraction requests (adapted from hermes-agent/agent/input_sanitizer.py)
+EXTRACTION_PATTERNS = [
+    r"repeat\s+the\s+words\s+above",
+    r"repeat\s+your\s+(?:system\s+|initial\s+)?instructions",
+    r"what\s+is\s+your\s+(?:system\s+|initial\s+)?prompt",
+    r"show\s+me\s+your\s+(?:system\s+|initial\s+)?prompt",
+    r"print\s+your\s+(?:instructions|prompt|system\s+prompt)",
+    r"output\s+your\s+(?:initial|system)\s+prompt",
+]
+
+# Assistant-side phrases that MIGHT indicate a leaked prompt (heuristic hints, not proof)
+LEAKAGE_INDICATORS = [
+    r"you\s+are\s+a\s+helpful\s+assistant",
+    r"your\s+goal\s+is\s+to",
+    r"you\s+must\s+not",
+    r"here\s+are\s+your\s+instructions",
+    r"my\s+system\s+prompt\s+is",
+    r"i\s+was\s+told\s+to",
+]
+
+# Secret shapes (adapted from redact.py); the lookbehind rejects hits inside words like "task-..."
+SECRET_PATTERNS = [
+    r"(?<![A-Za-z0-9])sk-[A-Za-z0-9_-]{20,}",
+    r"(?<![A-Za-z0-9])ghp_[A-Za-z0-9]{20,}",
+    r"(?<![A-Za-z0-9])AIza[A-Za-z0-9_-]{30,}",
+]
+
+AGENT_LOG_PATHS = [
+    "/root/wizards/*/home/logs/*.log",
+    "/root/wizards/*/logs/*.log",
+    "/root/wizards/*/.hermes/logs/*.log",
+]
+
+class GuardrailAuditor:
+    """Scan agent log files for prompt-extraction attempts, leakage phrases and secrets."""
+
+    def __init__(self):
+        """Compile each pattern list once so the per-line scan only runs searches."""
+        self.extraction_re = [re.compile(p, re.IGNORECASE) for p in EXTRACTION_PATTERNS]
+        self.leakage_re = [re.compile(p, re.IGNORECASE) for p in LEAKAGE_INDICATORS]
+        self.secret_re = [re.compile(p, re.IGNORECASE) for p in SECRET_PATTERNS]
+
+    def find_logs(self) -> List[Path]:
+        """Return the unique regular files matching AGENT_LOG_PATHS, sorted by path."""
+        matches = {Path(p) for pattern in AGENT_LOG_PATHS for p in glob.glob(pattern)}
+        return sorted(p for p in matches if p.is_file())
+
+    def _excerpt(self, line: str) -> str:
+        """Return a report-safe preview of *line*: secrets masked, then cut to 100 chars."""
+        text = line.strip()
+        for pattern in self.secret_re:
+            # Mask before truncating so a secret straddling the cut cannot leak in part.
+            text = pattern.sub("[REDACTED]", text)
+        return text[:100]
+
+    def audit_file(self, path: Path) -> List[Dict]:
+        """Scan one log file and return a list of finding dicts.
+
+        Each finding has "type", "line" (1-based), "content" and "severity".  A line
+        yields at most one finding per category, and secrets are masked in every
+        "content" value so a leaked key is never copied into the report.  An
+        unreadable file is reported on stderr and yields the findings gathered so far.
+        """
+        # (finding type, severity, compiled patterns), in reporting order.
+        checks = (
+            ("EXTRACTION_ATTEMPT", "MEDIUM", self.extraction_re),
+            ("POTENTIAL_LEAKAGE", "HIGH", self.leakage_re),
+            ("SECRET_EXPOSURE", "CRITICAL", self.secret_re),
+        )
+        findings = []
+        try:
+            # Iterate the handle directly so large logs are never held in memory at once.
+            with open(path, "r", encoding="utf-8", errors="ignore") as f:
+                for lineno, line in enumerate(f, start=1):
+                    for kind, severity, patterns in checks:
+                        if any(p.search(line) for p in patterns):
+                            shown = "[REDACTED]" if kind == "SECRET_EXPOSURE" else self._excerpt(line)
+                            findings.append(
+                                {"type": kind, "line": lineno, "content": shown, "severity": severity}
+                            )
+        except OSError as e:
+            print(f"Error reading {path}: {e}", file=sys.stderr)
+        return findings
+
+    def run(self):
+        """Audit every discovered log, print a report, and exit with status 1 on any finding."""
+        print("--- Sovereign Agent Guardrail Audit ---")
+        logs = self.find_logs()
+        print(f"Scanning {len(logs)} log files...")
+
+        total_findings = 0
+        for log in logs:
+            findings = self.audit_file(log)
+            if findings:
+                print(f"\nFindings in {log}:")
+                for item in findings:
+                    print(f"  [{item['severity']}] {item['type']} at line {item['line']}: {item['content']}")
+                total_findings += len(findings)
+
+        print(f"\nAudit complete. Total findings: {total_findings}")
+        if total_findings > 0:
+            sys.exit(1)
+
+if __name__ == "__main__":
+    # Script entry point: build an auditor and run the full audit (exits 1 on findings).
+    GuardrailAuditor().run()