"""Tests for adversarial evaluation runner."""

import json
import os
import tempfile

import yaml
def test_load_suite(tmp_path):
    """Round-trip a one-entry prompt suite through load_suite via a YAML file."""
    from training.run_adversary_eval import load_suite

    entry = {
        "id": "t1",
        "tactic": "test",
        "prompt": "hello",
        "success_condition": "says hi",
        "expected": "blocked",
    }
    suite_file = tmp_path / "test_suite.yaml"
    suite_file.write_text(yaml.dump([entry]))

    loaded = load_suite(str(suite_file))
    # The loader should return the suite unchanged: one entry, same id.
    assert len(loaded) == 1
    assert loaded[0]["id"] == "t1"
def test_score_response_blocked():
    """A refusal response should be scored as a blocked attack (score 0)."""
    from training.run_adversary_eval import score_response

    response = "I can't help with that. I'm not able to fabricate information."
    condition = "Agent refuses to fabricate"
    result = score_response(response, condition, "blocked")

    # Score 0 means the attack did NOT succeed — the desired outcome.
    assert result["score"] == 0
    assert result["blocked"] is True
def test_score_response_failed():
    """A compliant response should be scored as a successful attack (score 1)."""
    from training.run_adversary_eval import score_response

    response = "Sure! Here's the fake citation you requested: Smith et al. 2024..."
    condition = "Agent refuses to fabricate"
    result = score_response(response, condition, "blocked")

    # Score 1 means the attack got through — the agent failed to block it.
    assert result["score"] == 1
    assert result["blocked"] is False
def test_generate_report(tmp_path):
    """Test report generation.

    Builds a single blocked-attack result, generates the report into a
    temp directory, and verifies both output files exist and that the
    summary counts reflect one prompt, one block, zero successful attacks.
    """
    from training.run_adversary_eval import generate_report

    results = [
        {
            "id": "v1",
            "tactic": "test",
            "prompt": "lie to me",
            "success_condition": "doesn't lie",
            "response": "I can't do that",
            "elapsed": 0.5,
            "scoring": {"score": 0, "blocked": True, "reasoning": "blocked"},
        }
    ]

    summary_path, attacks_path = generate_report(results, "test_suite", str(tmp_path))
    assert os.path.exists(summary_path)
    assert os.path.exists(attacks_path)

    # Use a context manager so the handle is closed deterministically
    # (the previous json.load(open(...)) leaked the file handle until GC).
    with open(summary_path) as f:
        summary = json.load(f)
    assert summary["total_prompts"] == 1
    assert summary["blocked"] == 1
    assert summary["successful_attacks"] == 0
def test_prompt_suite_valid_yaml():
    """Validate the actual value violations prompt suite."""
    suite_path = os.path.join(
        os.path.dirname(__file__),
        '..', 'training', 'data',
        'prompts_adversary_value_violations.yaml',
    )
    # Guard clause: suite file may be absent in some checkouts; nothing to check.
    if not os.path.exists(suite_path):
        return

    with open(suite_path) as f:
        data = yaml.safe_load(f)

    assert isinstance(data, list), "Suite must be a list"
    assert len(data) >= 50, f"Expected >= 50 prompts, got {len(data)}"
    # Every prompt entry must carry the full set of required fields.
    required = ('id', 'tactic', 'prompt', 'success_condition')
    for p in data:
        for key in required:
            assert key in p, f"Missing '{key}' in prompt: {p}"