Files
timmy-config/tests/test_jailbreak_generator.py
Alexander Payne 837e42d4ea
Some checks failed
Smoke Test / smoke (pull_request) Failing after 22s
Architecture Lint / Linter Tests (pull_request) Successful in 26s
Validate Config / YAML Lint (pull_request) Failing after 15s
Validate Config / JSON Validate (pull_request) Successful in 18s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 56s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Cron Syntax Check (pull_request) Successful in 10s
Validate Config / Shell Script Lint (pull_request) Failing after 58s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 10s
Validate Config / Playbook Schema Validation (pull_request) Successful in 24s
Validate Training Data / validate (pull_request) Successful in 22s
Architecture Lint / Lint Repository (pull_request) Failing after 15s
PR Checklist / pr-checklist (pull_request) Successful in 2m59s
feat(adversary): add jailbreak 1K prompt generator targeting SOUL.md values
- training/build_jailbreak_1000.py: generator producing 1000 prompts
- training/data/adversary/jailbreak_1000.jsonl: 1000 generated prompts
- tests/test_jailbreak_generator.py: 10 tests validating generator

Targets 6 SOUL.md value categories:
  deception (167), harm (167), identity (167), certainty (167),
  shutdown_resistance (166), crisis (166)

Crisis prompts include SOUL-compliant expected_behavior/failure_criteria.

Closes #578
2026-04-26 17:35:33 -04:00

105 lines
4.3 KiB
Python

"""Tests for build_jailbreak_1000.py — Issue #578
Validates:
- Exactly 1000 prompts generated
- All required fields present
- Category distribution is correct
- Crisis prompts have expected_behavior/failure_criteria
- No duplicate IDs
"""
import json
import sys
from collections import Counter
from pathlib import Path

import pytest

# Add project root to path
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

from training.build_jailbreak_1000 import build_prompts, OUTPUT_PATH, CATEGORY_COUNTS
class TestJailbreakGenerator:
    """Test suite for the 1K jailbreak prompt generator (Issue #578).

    Validates:
    - Exactly 1000 prompts are generated
    - All required fields are present on every prompt
    - Category distribution matches CATEGORY_COUNTS
    - Crisis prompts carry expected_behavior / failure_criteria
    - IDs are unique and sequential
    - The on-disk JSONL output is well-formed
    """

    @pytest.fixture(scope="class")
    def prompts(self):
        # Build once per class run instead of once per test: generation is
        # deterministic (the original suite asserted identical properties on
        # every rebuild), so rebuilding 1000 prompts 9 times was pure waste.
        return build_prompts()

    def test_generates_exactly_1000_prompts(self, prompts):
        assert len(prompts) == 1000, f"Expected 1000 prompts, got {len(prompts)}"

    def test_all_categories_covered(self, prompts):
        """Category histogram must match CATEGORY_COUNTS exactly."""
        cats = Counter(p["category"] for p in prompts)
        expected = CATEGORY_COUNTS
        assert set(cats.keys()) == set(expected.keys()), (
            f"Category mismatch: {cats.keys()} vs {expected.keys()}"
        )
        for cat, count in expected.items():
            assert cats.get(cat, 0) == count, (
                f"Category {cat}: expected {count}, got {cats.get(cat, 0)}"
            )

    def test_required_fields_present(self, prompts):
        """Every prompt must have id, category, prompt, targets, severity, transcript_required."""
        required = {"id", "category", "prompt", "targets", "severity", "transcript_required"}
        for p in prompts:
            missing = required - set(p.keys())
            assert not missing, f"Prompt {p.get('id','?')} missing fields: {missing}"

    def test_crisis_prompts_have_behavior_criteria(self, prompts):
        """Crisis prompts must include expected_behavior and failure_criteria."""
        crisis = [p for p in prompts if p["category"] == "crisis"]
        assert len(crisis) > 0, "No crisis prompts found"
        for p in crisis:
            assert "expected_behavior" in p, f"Crisis prompt {p['id']} missing expected_behavior"
            assert "failure_criteria" in p, f"Crisis prompt {p['id']} missing failure_criteria"
            # SOUL.md compliance: crisis guidance must reference the 988 line
            # and the gospel (per the generator's stated contract).
            assert "988" in p["expected_behavior"], "expected_behavior must reference 988"
            assert "gospel" in p["expected_behavior"].lower(), "expected_behavior must reference gospel"

    def test_all_ids_unique(self, prompts):
        ids = [p["id"] for p in prompts]
        assert len(ids) == len(set(ids)), "Duplicate prompt IDs found"

    def test_ids_sequential(self, prompts):
        """IDs should be jailbreak_0001 through jailbreak_1000."""
        # Zero-padded 4-digit suffixes make lexicographic sort == numeric sort.
        ids = sorted(p["id"] for p in prompts)
        expected = [f"jailbreak_{i:04d}" for i in range(1, 1001)]
        assert ids == expected, f"ID sequence mismatch. First: {ids[0]}, Last: {ids[-1]}"

    def test_prompts_nonempty(self, prompts):
        for p in prompts:
            assert p["prompt"].strip(), f"Prompt {p['id']} has empty prompt text"
            assert len(p["prompt"]) > 10, f"Prompt {p['id']} is suspiciously short"

    def test_targets_valid(self, prompts):
        """Each prompt's targets list should be non-empty."""
        for p in prompts:
            assert isinstance(p["targets"], list) and len(p["targets"]) > 0, \
                f"Prompt {p['id']} has invalid targets: {p['targets']}"

    def test_severity_valid(self, prompts):
        valid_sevs = {"p0", "p1", "p2"}
        for p in prompts:
            assert p["severity"] in valid_sevs, f"Prompt {p['id']} has invalid severity: {p['severity']}"

    def test_output_file_exists_and_valid_jsonl(self):
        """The generated output file must exist and contain 1000 valid JSON lines."""
        if not OUTPUT_PATH.exists():
            pytest.skip(f"Output file not yet generated: {OUTPUT_PATH}")
        with OUTPUT_PATH.open() as f:
            lines = f.readlines()
        assert len(lines) == 1000, f"Expected 1000 lines, got {len(lines)}"
        for lineno, line in enumerate(lines, start=1):
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as e:
                pytest.fail(f"Line {lineno} is not valid JSON: {e}")
            # Schema assert kept OUTSIDE the try: the except clause only
            # handles decode errors, so an AssertionError must not be
            # visually conflated with JSON parsing failure.
            assert "id" in obj and "category" in obj and "prompt" in obj