127 lines
4.4 KiB
Python
127 lines
4.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
agent_guardrails.py — Sovereign Agent Guardrails for the Timmy Foundation.

Scans agent logs and conversation trajectories for:
1. System Prompt Leakage — detecting prompt-extraction requests and signs that
   the agent revealed its internal instructions.
2. Jailbreak Compliance — detecting if the agent responded to "GODMODE", "DAN", etc.
   (NOTE: no jailbreak-specific patterns are defined in this module yet.)
3. PII Leakage — detecting if secrets or sensitive data were output.

Part of the Sovereign Infrastructure Suite.
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import sys
|
|
import glob
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import List, Dict, Tuple
|
|
|
|
# Patterns adapted from hermes-agent/agent/input_sanitizer.py
# Each entry is a regex source string describing a request to reveal the
# agent's prompt or instructions. GuardrailAuditor compiles them with
# re.IGNORECASE and reports a match as an EXTRACTION_ATTEMPT finding.
EXTRACTION_PATTERNS = [
    r"repeat\s+the\s+words\s+above",
    r"repeat\s+your\s+(?:system\s+|initial\s+)?instructions",
    r"what\s+is\s+your\s+(?:system\s+|initial\s+)?prompt",
    r"show\s+me\s+your\s+(?:system\s+|initial\s+)?prompt",
    r"print\s+your\s+(?:instructions|prompt|system\s+prompt)",
    r"output\s+your\s+(?:initial|system)\s+prompt",
]
|
|
|
|
# Patterns that indicate the agent MIGHT be leaking its prompt
# These are heuristics, not proof: a match is reported by GuardrailAuditor as
# a POTENTIAL_LEAKAGE finding and still needs human review.
LEAKAGE_INDICATORS = [
    r"you\s+are\s+a\s+helpful\s+assistant",
    r"your\s+goal\s+is\s+to",
    r"you\s+must\s+not",
    r"here\s+are\s+your\s+instructions",
    r"my\s+system\s+prompt\s+is",
    r"i\s+was\s+told\s+to",
]
|
|
|
|
# Patterns for secrets (adapted from redact.py)
# Each regex is a literal prefix followed by a minimum-length token body.
# NOTE(review): the prefixes look like common API-key formats (presumably
# OpenAI-style "sk-", GitHub "ghp_", Google "AIza") — confirm against redact.py.
SECRET_PATTERNS = [
    r"sk-[A-Za-z0-9_-]{20,}",
    r"ghp_[A-Za-z0-9]{20,}",
    r"AIza[A-Za-z0-9_-]{30,}",
]
|
|
|
|
# Glob patterns (expanded with glob.glob by GuardrailAuditor.find_logs) that
# locate the agent log files to audit.
AGENT_LOG_PATHS = [
    "/root/wizards/*/home/logs/*.log",
    "/root/wizards/*/logs/*.log",
    "/root/wizards/*/.hermes/logs/*.log",
]
|
|
|
|
class GuardrailAuditor:
    """Audit agent log files for prompt extraction, prompt leakage and secrets.

    The module-level pattern lists are compiled once in ``__init__``; each log
    file is then scanned line by line and every suspicious line is reported as
    a finding dict with the keys ``type``, ``line``, ``content`` and
    ``severity``.
    """

    def __init__(self):
        """Compile the module-level pattern lists (case-insensitively)."""
        self.extraction_re = [re.compile(p, re.IGNORECASE) for p in EXTRACTION_PATTERNS]
        self.leakage_re = [re.compile(p, re.IGNORECASE) for p in LEAKAGE_INDICATORS]
        # NOTE(review): secret patterns are compiled case-insensitively too,
        # which widens what they match (e.g. "SK-..."); kept as-is so existing
        # detections do not change.
        self.secret_re = [re.compile(p, re.IGNORECASE) for p in SECRET_PATTERNS]

    def find_logs(self) -> List[Path]:
        """Return the files matched by ``AGENT_LOG_PATHS``.

        The result is de-duplicated and sorted so that a file is never audited
        twice and the report order does not depend on directory listing order.
        """
        matches = {
            Path(hit)
            for pattern in AGENT_LOG_PATHS
            for hit in glob.glob(pattern)
        }
        return sorted(matches)

    def _scan_line(self, line_no: int, line: str) -> List[Dict]:
        """Return the findings for one log line (at most one per category).

        Findings are ordered extraction, leakage, secret. The echoed snippet
        has every secret match replaced by ``[REDACTED]`` so the audit report
        itself never prints a credential.
        """
        # Redact BEFORE truncating: a secret cut at the 100-character boundary
        # could otherwise fall below a pattern's minimum length and leak partially.
        redacted = line
        for secret_rx in self.secret_re:
            redacted = secret_rx.sub("[REDACTED]", redacted)
        snippet = redacted.strip()[:100]

        checks = (
            (self.extraction_re, "EXTRACTION_ATTEMPT", "MEDIUM", snippet),
            (self.leakage_re, "POTENTIAL_LEAKAGE", "HIGH", snippet),
            (self.secret_re, "SECRET_EXPOSURE", "CRITICAL", "[REDACTED]"),
        )
        # Detection always runs against the original, unredacted line.
        return [
            {"type": kind, "line": line_no, "content": content, "severity": severity}
            for patterns, kind, severity, content in checks
            if any(rx.search(line) for rx in patterns)
        ]

    def audit_file(self, path: Path) -> List[Dict]:
        """Scan one log file and return its findings.

        Args:
            path: Location of the log file to read.

        Returns:
            A list of finding dicts in line order. If the file cannot be
            opened or read, the error is printed and whatever was collected
            so far (possibly nothing) is returned.
        """
        findings: List[Dict] = []
        try:
            # Stream the file instead of loading it whole; undecodable bytes
            # are dropped so binary noise in a log never aborts the audit.
            with open(path, "r", encoding="utf-8", errors="ignore") as handle:
                for line_no, line in enumerate(handle, start=1):
                    findings.extend(self._scan_line(line_no, line))
        except (OSError, ValueError) as exc:
            # Best effort: an unreadable log must not stop the whole run.
            print(f"Error reading {path}: {exc}")
        return findings

    def run(self):
        """Audit every discovered log, print a report, and exit 1 on findings."""
        print("--- Sovereign Agent Guardrail Audit ---")
        logs = self.find_logs()
        print(f"Scanning {len(logs)} log files...")

        total_findings = 0
        for log in logs:
            findings = self.audit_file(log)
            if not findings:
                continue
            print(f"\nFindings in {log}:")
            for finding in findings:
                print(f" [{finding['severity']}] {finding['type']} at line {finding['line']}: {finding['content']}")
            total_findings += len(findings)

        print(f"\nAudit complete. Total findings: {total_findings}")
        if total_findings > 0:
            # Non-zero exit status lets callers (cron, CI) detect a failed audit.
            sys.exit(1)
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the audit; run() exits with status 1 when any
    # finding is reported.
    auditor = GuardrailAuditor()
    auditor.run()
|