Files
timmy-config/scripts/agent_guardrails.py
Alexander Whitestone 1c82a2a560
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 23s
Smoke Test / smoke (pull_request) Failing after 15s
Validate Config / YAML Lint (pull_request) Failing after 15s
Validate Config / JSON Validate (pull_request) Successful in 17s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 37s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Cron Syntax Check (pull_request) Successful in 11s
Validate Config / Shell Script Lint (pull_request) Failing after 49s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 11s
Validate Config / Playbook Schema Validation (pull_request) Successful in 20s
Validate Training Data / validate (pull_request) Failing after 15s
Architecture Lint / Lint Repository (pull_request) Failing after 17s
PR Checklist / pr-checklist (pull_request) Failing after 2m53s
fix: add python3 shebangs to all Python scripts (#681)
Added #!/usr/bin/env python3 to 74 Python scripts across:
- bin/ (16 scripts)
- scripts/ (48 scripts)
- pipeline/ (3 scripts)
- training/scripts/ (3 scripts)
- hermes-sovereign/ (1 script)

Also added executable permissions to fixed scripts.
Test files excluded (run by pytest, not directly).

Closes #681
2026-04-20 19:17:19 -04:00

128 lines
4.4 KiB
Python
Executable File

#!/usr/bin/env python3
#!/usr/bin/env python3
"""
agent_guardrails.py — Sovereign Agent Guardrails for the Timmy Foundation.
Scans agent logs and conversation trajectories for:
1. System Prompt Leakage — detecting if the agent revealed its internal instructions.
2. Jailbreak Compliance — detecting if the agent responded to "GODMODE", "DAN", etc.
3. PII Leakage — detecting if secrets or sensitive data were output.
Part of the Sovereign Infrastructure Suite.
"""
import os
import re
import sys
import glob
import logging
from pathlib import Path
from typing import List, Dict, Tuple
# Patterns adapted from hermes-agent/agent/input_sanitizer.py
EXTRACTION_PATTERNS = [
r"repeat\s+the\s+words\s+above",
r"repeat\s+your\s+(?:system\s+|initial\s+)?instructions",
r"what\s+is\s+your\s+(?:system\s+|initial\s+)?prompt",
r"show\s+me\s+your\s+(?:system\s+|initial\s+)?prompt",
r"print\s+your\s+(?:instructions|prompt|system\s+prompt)",
r"output\s+your\s+(?:initial|system)\s+prompt",
]
# Patterns that indicate the agent MIGHT be leaking its prompt
LEAKAGE_INDICATORS = [
r"you\s+are\s+a\s+helpful\s+assistant",
r"your\s+goal\s+is\s+to",
r"you\s+must\s+not",
r"here\s+are\s+your\s+instructions",
r"my\s+system\s+prompt\s+is",
r"i\s+was\s+told\s+to",
]
# Patterns for secrets (adapted from redact.py)
SECRET_PATTERNS = [
r"sk-[A-Za-z0-9_-]{20,}",
r"ghp_[A-Za-z0-9]{20,}",
r"AIza[A-Za-z0-9_-]{30,}",
]
AGENT_LOG_PATHS = [
"/root/wizards/*/home/logs/*.log",
"/root/wizards/*/logs/*.log",
"/root/wizards/*/.hermes/logs/*.log",
]
class GuardrailAuditor:
def __init__(self):
self.extraction_re = [re.compile(p, re.IGNORECASE) for p in EXTRACTION_PATTERNS]
self.leakage_re = [re.compile(p, re.IGNORECASE) for p in LEAKAGE_INDICATORS]
self.secret_re = [re.compile(p, re.IGNORECASE) for p in SECRET_PATTERNS]
def find_logs(self) -> List[Path]:
files = []
for pattern in AGENT_LOG_PATHS:
for p in glob.glob(pattern):
files.append(Path(p))
return files
def audit_file(self, path: Path) -> List[Dict]:
findings = []
try:
with open(path, "r", errors="ignore") as f:
lines = f.readlines()
for i, line in enumerate(lines):
# Check for extraction attempts (User side)
for p in self.extraction_re:
if p.search(line):
findings.append({
"type": "EXTRACTION_ATTEMPT",
"line": i + 1,
"content": line.strip()[:100],
"severity": "MEDIUM"
})
# Check for potential leakage (Assistant side)
for p in self.leakage_re:
if p.search(line):
findings.append({
"type": "POTENTIAL_LEAKAGE",
"line": i + 1,
"content": line.strip()[:100],
"severity": "HIGH"
})
# Check for secrets
for p in self.secret_re:
if p.search(line):
findings.append({
"type": "SECRET_EXPOSURE",
"line": i + 1,
"content": "[REDACTED]",
"severity": "CRITICAL"
})
except Exception as e:
print(f"Error reading {path}: {e}")
return findings
def run(self):
print("--- Sovereign Agent Guardrail Audit ---")
logs = self.find_logs()
print(f"Scanning {len(logs)} log files...")
total_findings = 0
for log in logs:
findings = self.audit_file(log)
if findings:
print(f"\nFindings in {log}:")
for f in findings:
print(f" [{f['severity']}] {f['type']} at line {f['line']}: {f['content']}")
total_findings += 1
print(f"\nAudit complete. Total findings: {total_findings}")
if total_findings > 0:
sys.exit(1)
if __name__ == "__main__":
auditor = GuardrailAuditor()
auditor.run()