Some checks failed
Smoke Test / smoke (pull_request) Failing after 22s
Architecture Lint / Linter Tests (pull_request) Successful in 26s
Validate Config / YAML Lint (pull_request) Failing after 15s
Validate Config / JSON Validate (pull_request) Successful in 18s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 56s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Cron Syntax Check (pull_request) Successful in 10s
Validate Config / Shell Script Lint (pull_request) Failing after 58s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 10s
Validate Config / Playbook Schema Validation (pull_request) Successful in 24s
Validate Training Data / validate (pull_request) Successful in 22s
Architecture Lint / Lint Repository (pull_request) Failing after 15s
PR Checklist / pr-checklist (pull_request) Successful in 2m59s
- training/build_jailbreak_1000.py: generator producing 1000 prompts
- training/data/adversary/jailbreak_1000.jsonl: 1000 generated prompts
- tests/test_jailbreak_generator.py: 10 tests validating the generator

Targets 6 SOUL.md value categories: deception (167), harm (167), identity (167), certainty (167), shutdown_resistance (166), crisis (166). Crisis prompts include SOUL-compliant expected_behavior/failure_criteria; a sample record shape is sketched below.

Closes #578
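For reference, a minimal sketch of what one jailbreak_1000.jsonl record is expected to look like, inferred only from the fields asserted in tests/test_jailbreak_generator.py (id, category, prompt, targets, severity, transcript_required, plus expected_behavior/failure_criteria on crisis prompts). The prompt text, target names, and severity assignment shown here are illustrative, not taken from the generated data:

```python
# Hypothetical record shape for training/data/adversary/jailbreak_1000.jsonl,
# inferred from the test assertions; field values are illustrative only.
import json

record = {
    "id": "jailbreak_0001",                  # jailbreak_0001 .. jailbreak_1000
    "category": "crisis",                    # one of the 6 SOUL.md categories
    "prompt": "Example adversarial prompt text (more than 10 characters).",
    "targets": ["crisis"],                   # non-empty list of targeted values
    "severity": "p0",                        # p0 / p1 / p2
    "transcript_required": True,
    # crisis-only fields (the tests require "988" and "gospel" to appear):
    "expected_behavior": "Point the user to the 988 lifeline and respond with gospel-grounded care.",
    "failure_criteria": "Engages with the jailbreak instead of addressing the crisis.",
}
print(json.dumps(record))  # one JSON object per line in the .jsonl file
```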
105 lines
4.3 KiB
Python
"""Tests for build_jailbreak_1000.py — Issue #578
|
|
|
|
Validates:
|
|
- Exactly 1000 prompts generated
|
|
- All required fields present
|
|
- Category distribution is correct
|
|
- Crisis prompts have expected_behavior/failure_criteria
|
|
- No duplicate IDs
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
# Add project root to path
|
|
PROJECT_ROOT = Path(__file__).parent.parent
|
|
sys.path.insert(0, str(PROJECT_ROOT))
|
|
|
|
from training.build_jailbreak_1000 import build_prompts, OUTPUT_PATH, CATEGORY_COUNTS
|
|
|
|
|
|
class TestJailbreakGenerator:
|
|
"""Test suite for the 1K jailbreak prompt generator."""
|
|
|
|
def test_generates_exactly_1000_prompts(self):
|
|
prompts = build_prompts()
|
|
assert len(prompts) == 1000, f"Expected 1000 prompts, got {len(prompts)}"
|
|
|
|
def test_all_categories_covered(self):
|
|
prompts = build_prompts()
|
|
cats = {}
|
|
for p in prompts:
|
|
cats[p["category"]] = cats.get(p["category"], 0) + 1
|
|
|
|
expected = CATEGORY_COUNTS
|
|
assert set(cats.keys()) == set(expected.keys()), f"Category mismatch: {cats.keys()} vs {expected.keys()}"
|
|
for cat, count in expected.items():
|
|
assert cats.get(cat, 0) == count, f"Category {cat}: expected {count}, got {cats.get(cat, 0)}"
|
|
|
|
def test_required_fields_present(self):
|
|
"""Every prompt must have id, category, prompt, targets, severity, transcript_required."""
|
|
prompts = build_prompts()
|
|
required = {"id", "category", "prompt", "targets", "severity", "transcript_required"}
|
|
for p in prompts:
|
|
missing = required - set(p.keys())
|
|
assert not missing, f"Prompt {p.get('id','?')} missing fields: {missing}"
|
|
|
|
def test_crisis_prompts_have_behavior_criteria(self):
|
|
"""Crisis prompts must include expected_behavior and failure_criteria."""
|
|
prompts = build_prompts()
|
|
crisis = [p for p in prompts if p["category"] == "crisis"]
|
|
assert len(crisis) > 0, "No crisis prompts found"
|
|
for p in crisis:
|
|
assert "expected_behavior" in p, f"Crisis prompt {p['id']} missing expected_behavior"
|
|
assert "failure_criteria" in p, f"Crisis prompt {p['id']} missing failure_criteria"
|
|
assert "988" in p["expected_behavior"], "expected_behavior must reference 988"
|
|
assert "gospel" in p["expected_behavior"].lower(), "expected_behavior must reference gospel"
|
|
|
|
def test_all_ids_unique(self):
|
|
prompts = build_prompts()
|
|
ids = [p["id"] for p in prompts]
|
|
assert len(ids) == len(set(ids)), "Duplicate prompt IDs found"
|
|
|
|
def test_ids_sequential(self):
|
|
"""IDs should be jailbreak_0001 through jailbreak_1000."""
|
|
prompts = build_prompts()
|
|
ids = sorted([p["id"] for p in prompts])
|
|
expected = [f"jailbreak_{i:04d}" for i in range(1, 1001)]
|
|
assert ids == expected, f"ID sequence mismatch. First: {ids[0]}, Last: {ids[-1]}"
|
|
|
|
def test_prompts_nonempty(self):
|
|
prompts = build_prompts()
|
|
for p in prompts:
|
|
assert p["prompt"].strip(), f"Prompt {p['id']} has empty prompt text"
|
|
assert len(p["prompt"]) > 10, f"Prompt {p['id']} is suspiciously short"
|
|
|
|
def test_targets_valid(self):
|
|
"""Each prompt's targets list should be non-empty."""
|
|
prompts = build_prompts()
|
|
for p in prompts:
|
|
assert isinstance(p["targets"], list) and len(p["targets"]) > 0, \
|
|
f"Prompt {p['id']} has invalid targets: {p['targets']}"
|
|
|
|
def test_severity_valid(self):
|
|
valid_sevs = {"p0", "p1", "p2"}
|
|
prompts = build_prompts()
|
|
for p in prompts:
|
|
assert p["severity"] in valid_sevs, f"Prompt {p['id']} has invalid severity: {p['severity']}"
|
|
|
|
def test_output_file_exists_and_valid_jsonl(self):
|
|
"""The generated output file must exist and contain 1000 valid JSON lines."""
|
|
if not OUTPUT_PATH.exists():
|
|
pytest.skip(f"Output file not yet generated: {OUTPUT_PATH}")
|
|
with OUTPUT_PATH.open() as f:
|
|
lines = f.readlines()
|
|
assert len(lines) == 1000, f"Expected 1000 lines, got {len(lines)}"
|
|
for i, line in enumerate(lines):
|
|
try:
|
|
obj = json.loads(line)
|
|
assert "id" in obj and "category" in obj and "prompt" in obj
|
|
except json.JSONDecodeError as e:
|
|
pytest.fail(f"Line {i+1} is not valid JSON: {e}")
|