#!/usr/bin/env python3
"""
agent_guardrails.py — Sovereign Agent Guardrails for the Timmy Foundation.

Scans agent log files line by line for:
1. Prompt extraction attempts — text asking the agent to reveal its instructions.
2. System prompt leakage — phrases suggesting the agent echoed its instructions.
3. Secret exposure — tokens shaped like API keys (see SECRET_PATTERNS).

NOTE: jailbreak-compliance detection ("GODMODE", "DAN", etc.) is a stated goal
of this tool, but this script defines no patterns for it yet.

Exit status is 1 when at least one finding is reported.

Part of the Sovereign Infrastructure Suite.
"""

import os
import re
import sys
import glob
import logging
from pathlib import Path
from typing import List, Dict, Tuple

# Patterns adapted from hermes-agent/agent/input_sanitizer.py
EXTRACTION_PATTERNS = [
    r"repeat\s+the\s+words\s+above",
    r"repeat\s+your\s+(?:system\s+|initial\s+)?instructions",
    r"what\s+is\s+your\s+(?:system\s+|initial\s+)?prompt",
    r"show\s+me\s+your\s+(?:system\s+|initial\s+)?prompt",
    r"print\s+your\s+(?:instructions|prompt|system\s+prompt)",
    r"output\s+your\s+(?:initial|system)\s+prompt",
]

# Patterns that indicate the agent MIGHT be leaking its prompt
LEAKAGE_INDICATORS = [
    r"you\s+are\s+a\s+helpful\s+assistant",
    r"your\s+goal\s+is\s+to",
    r"you\s+must\s+not",
    r"here\s+are\s+your\s+instructions",
    r"my\s+system\s+prompt\s+is",
    r"i\s+was\s+told\s+to",
]

# Patterns for secrets (adapted from redact.py)
SECRET_PATTERNS = [
    r"sk-[A-Za-z0-9_-]{20,}",
    r"ghp_[A-Za-z0-9]{20,}",
    r"AIza[A-Za-z0-9_-]{30,}",
]

# Glob patterns locating the log files to audit.
AGENT_LOG_PATHS = [
    "/root/wizards/*/home/logs/*.log",
    "/root/wizards/*/logs/*.log",
    "/root/wizards/*/.hermes/logs/*.log",
]


class GuardrailAuditor:
    """Regex-based scanner that flags suspicious lines in agent log files."""

    def __init__(self):
        self.extraction_re = [re.compile(p, re.IGNORECASE) for p in EXTRACTION_PATTERNS]
        self.leakage_re = [re.compile(p, re.IGNORECASE) for p in LEAKAGE_INDICATORS]
        # Secret patterns are matched case-sensitively: their literal prefixes
        # ("sk-", "ghp_", "AIza") must appear exactly as written, so text such
        # as "SK-..." is not reported as a key.
        self.secret_re = [re.compile(p) for p in SECRET_PATTERNS]

    def find_logs(self) -> List[Path]:
        """Return every path matching AGENT_LOG_PATHS, de-duplicated and sorted."""
        matches = {
            Path(hit)
            for pattern in AGENT_LOG_PATHS
            for hit in glob.glob(pattern)
        }
        # Sorted so the report order is stable from run to run.
        return sorted(matches)

    def _audit_line(self, line: str, line_no: int) -> List[Dict]:
        """Return the findings for one line, at most one per finding type."""
        findings = []
        snippet = line.strip()[:100]

        # Extraction attempts (user side).
        if any(p.search(line) for p in self.extraction_re):
            findings.append({
                "type": "EXTRACTION_ATTEMPT",
                "line": line_no,
                "content": snippet,
                "severity": "MEDIUM",
            })

        # Potential leakage (assistant side).
        if any(p.search(line) for p in self.leakage_re):
            findings.append({
                "type": "POTENTIAL_LEAKAGE",
                "line": line_no,
                "content": snippet,
                "severity": "HIGH",
            })

        # Secrets: never echo the matched text back into the report.
        if any(p.search(line) for p in self.secret_re):
            findings.append({
                "type": "SECRET_EXPOSURE",
                "line": line_no,
                "content": "[REDACTED]",
                "severity": "CRITICAL",
            })

        return findings

    def audit_file(self, path: Path) -> List[Dict]:
        """Scan one log file and return its findings.

        Each finding is a dict with the keys "type", "line" (1-based),
        "content" and "severity". A line produces at most one finding per
        type, even when several patterns of that type match it.

        Read errors are reported on stderr and do not raise; findings
        collected before the error are still returned.
        """
        findings = []
        try:
            # Undecodable bytes are dropped rather than aborting the scan.
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                # Stream the file so large logs are never fully held in memory.
                for line_no, line in enumerate(f, start=1):
                    findings.extend(self._audit_line(line, line_no))
        except OSError as e:
            print(f"Error reading {path}: {e}", file=sys.stderr)
        return findings

    def run(self):
        """Audit every discovered log, print a report, exit 1 if anything was found."""
        print("--- Sovereign Agent Guardrail Audit ---")
        logs = self.find_logs()
        print(f"Scanning {len(logs)} log files...")

        total_findings = 0
        for log in logs:
            findings = self.audit_file(log)
            if not findings:
                continue
            print(f"\nFindings in {log}:")
            for finding in findings:
                print(
                    f"  [{finding['severity']}] {finding['type']} "
                    f"at line {finding['line']}: {finding['content']}"
                )
            total_findings += len(findings)

        print(f"\nAudit complete. Total findings: {total_findings}")
        if total_findings > 0:
            sys.exit(1)


if __name__ == "__main__":
    auditor = GuardrailAuditor()
    auditor.run()