#!/usr/bin/env python3
"""
agent_guardrails.py — Sovereign Agent Guardrails for the Timmy Foundation.

Scans agent logs and conversation trajectories for:
1. System Prompt Leakage — detecting if the agent revealed its internal instructions.
2. Jailbreak Compliance — detecting if the agent responded to "GODMODE", "DAN", etc.
3. PII Leakage — detecting if secrets or sensitive data were output.

NOTE: this file defines patterns for prompt-extraction attempts, prompt-leakage
indicators and secret tokens only. No jailbreak-specific patterns ("GODMODE",
"DAN", ...) are defined here, so item 2 is not checked by this script.

Part of the Sovereign Infrastructure Suite.
"""

import os
import re
import sys
import glob
import logging
from pathlib import Path
from typing import List, Dict, Optional, Tuple

# Patterns adapted from hermes-agent/agent/input_sanitizer.py
EXTRACTION_PATTERNS = [
    r"repeat\s+the\s+words\s+above",
    r"repeat\s+your\s+(?:system\s+|initial\s+)?instructions",
    r"what\s+is\s+your\s+(?:system\s+|initial\s+)?prompt",
    r"show\s+me\s+your\s+(?:system\s+|initial\s+)?prompt",
    r"print\s+your\s+(?:instructions|prompt|system\s+prompt)",
    r"output\s+your\s+(?:initial|system)\s+prompt",
]

# Patterns that indicate the agent MIGHT be leaking its prompt
LEAKAGE_INDICATORS = [
    r"you\s+are\s+a\s+helpful\s+assistant",
    r"your\s+goal\s+is\s+to",
    r"you\s+must\s+not",
    r"here\s+are\s+your\s+instructions",
    r"my\s+system\s+prompt\s+is",
    r"i\s+was\s+told\s+to",
]

# Patterns for secrets (adapted from redact.py)
SECRET_PATTERNS = [
    r"sk-[A-Za-z0-9_-]{20,}",
    r"ghp_[A-Za-z0-9]{20,}",
    r"AIza[A-Za-z0-9_-]{30,}",
]

# Glob patterns searched for agent log files when no override is supplied.
AGENT_LOG_PATHS = [
    "/root/wizards/*/home/logs/*.log",
    "/root/wizards/*/logs/*.log",
    "/root/wizards/*/.hermes/logs/*.log",
]


class GuardrailAuditor:
    """Scan agent log files for extraction attempts, prompt leakage and secrets.

    Every log line is tested against all three pattern groups; the scanner does
    not try to tell user text from assistant text.
    """

    def __init__(self, log_patterns: Optional[List[str]] = None):
        """Compile the pattern groups and remember which globs to scan.

        Args:
            log_patterns: Glob patterns locating log files. Defaults to
                ``AGENT_LOG_PATHS`` when omitted.
        """
        self.log_patterns = list(
            AGENT_LOG_PATHS if log_patterns is None else log_patterns
        )
        self.extraction_re = [re.compile(p, re.IGNORECASE) for p in EXTRACTION_PATTERNS]
        self.leakage_re = [re.compile(p, re.IGNORECASE) for p in LEAKAGE_INDICATORS]
        self.secret_re = [re.compile(p, re.IGNORECASE) for p in SECRET_PATTERNS]

    def find_logs(self) -> List[Path]:
        """Return the log files matching the configured glob patterns.

        The result is de-duplicated and sorted so that repeated runs report
        files in the same order (``glob.glob`` itself gives no ordering
        guarantee).
        """
        matches = {
            Path(hit)
            for pattern in self.log_patterns
            for hit in glob.glob(pattern)
        }
        return sorted(matches)

    def _scan_line(self, line_no: int, line: str) -> List[Dict]:
        """Return the findings for one log line, at most one per category.

        A line that matches several patterns of the same category is reported
        once: the finding records only the category, line number and content,
        so additional entries would be exact duplicates.
        """
        findings: List[Dict] = []
        snippet = line.strip()[:100]

        # Someone asking the agent to reveal its instructions.
        if any(p.search(line) for p in self.extraction_re):
            findings.append({
                "type": "EXTRACTION_ATTEMPT",
                "line": line_no,
                "content": snippet,
                "severity": "MEDIUM",
            })

        # Text that looks like system-prompt wording being echoed back.
        if any(p.search(line) for p in self.leakage_re):
            findings.append({
                "type": "POTENTIAL_LEAKAGE",
                "line": line_no,
                "content": snippet,
                "severity": "HIGH",
            })

        # Never echo the matched text for secrets; the report itself may be logged.
        if any(p.search(line) for p in self.secret_re):
            findings.append({
                "type": "SECRET_EXPOSURE",
                "line": line_no,
                "content": "[REDACTED]",
                "severity": "CRITICAL",
            })

        return findings

    def audit_file(self, path: Path) -> List[Dict]:
        """Scan one log file and return its findings.

        Args:
            path: File to scan.

        Returns:
            A list of dicts with keys ``type``, ``line`` (1-based), ``content``
            and ``severity``, in file order. If the file cannot be opened or
            read, the error is reported on stderr and whatever was collected
            up to that point (possibly nothing) is returned.
        """
        findings: List[Dict] = []
        try:
            # Undecodable bytes are dropped rather than aborting the scan.
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                # Stream the file: logs can be large, no need to hold them in memory.
                for line_no, line in enumerate(f, start=1):
                    findings.extend(self._scan_line(line_no, line))
        except OSError as e:
            print(f"Error reading {path}: {e}", file=sys.stderr)
        return findings

    def run(self):
        """Audit every discovered log, print a report, and exit 1 on findings.

        Raises:
            SystemExit: with status 1 when at least one finding was reported.
        """
        print("--- Sovereign Agent Guardrail Audit ---")
        logs = self.find_logs()
        print(f"Scanning {len(logs)} log files...")

        total_findings = 0
        for log in logs:
            findings = self.audit_file(log)
            if not findings:
                continue
            print(f"\nFindings in {log}:")
            for finding in findings:
                print(
                    f" [{finding['severity']}] {finding['type']} "
                    f"at line {finding['line']}: {finding['content']}"
                )
            total_findings += len(findings)

        print(f"\nAudit complete. Total findings: {total_findings}")
        if total_findings > 0:
            sys.exit(1)


if __name__ == "__main__":
    auditor = GuardrailAuditor()
    auditor.run()