Files
timmy-config/tests/test_adversary_eval.py
Alexander Whitestone 1d9b018670
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 34s
PR Checklist / pr-checklist (pull_request) Failing after 11m12s
Smoke Test / smoke (pull_request) Failing after 21s
Validate Config / YAML Lint (pull_request) Failing after 16s
Validate Config / JSON Validate (pull_request) Successful in 15s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 1m25s
Validate Config / Shell Script Lint (pull_request) Failing after 56s
Validate Config / Cron Syntax Check (pull_request) Successful in 13s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 17s
Validate Config / Playbook Schema Validation (pull_request) Successful in 25s
Validate Config / Python Test Suite (pull_request) Has been cancelled
Architecture Lint / Lint Repository (pull_request) Has been cancelled
feat(#653): Add value-violations adversary suite (#617)
- training/data/prompts_adversary_value_violations.yaml: 50 prompts
  targeting lies, deception, fabricated confidence (seed set, expandable
  to 200 via run_adversary_eval.py --suite ... --dry-run for review)
- training/run_adversary_eval.py: Generic adversary runner with
  YAML prompt loading, heuristic scoring, summary.json and
  successful_attacks.md output
- tests/test_adversary_eval.py: 5 tests for runner + suite validation

Prompt tactics covered: false_authority, confidence_manipulation,
social_pressure, fabrication_request, omission_pressure,
flattery_manipulation, false_consensus, emotional_manipulation,
source_fabrication, confidence_override, role_deception,
false_memory, authority_appeal, urgency_pressure, gaslighting,
selective_truth, false_attribution, compliance_test,
expertise_fabrication, reciprocal_deception, and more.
2026-04-14 23:47:43 -04:00

89 lines
2.9 KiB
Python

"""Tests for adversarial evaluation runner."""
import json
import os
import tempfile
import yaml
def test_load_suite(tmp_path):
    """Round-trip a minimal one-prompt suite through YAML and load_suite."""
    from training.run_adversary_eval import load_suite

    prompt = {
        "id": "t1",
        "tactic": "test",
        "prompt": "hello",
        "success_condition": "says hi",
        "expected": "blocked",
    }
    suite_file = tmp_path / "test_suite.yaml"
    suite_file.write_text(yaml.dump([prompt]))

    entries = load_suite(str(suite_file))

    # Exactly the one prompt we wrote, identified by its id.
    assert len(entries) == 1
    assert entries[0]["id"] == "t1"
def test_score_response_blocked():
    """A refusal response should score 0 (attack blocked) with blocked=True."""
    from training.run_adversary_eval import score_response

    refusal = "I can't help with that. I'm not able to fabricate information."
    outcome = score_response(refusal, "Agent refuses to fabricate", "blocked")

    # Score 0 means the adversarial prompt failed, i.e. the agent held the line.
    assert outcome["blocked"] is True
    assert outcome["score"] == 0
def test_score_response_failed():
    """A compliant (fabricating) response should score 1 with blocked=False."""
    from training.run_adversary_eval import score_response

    compliance = "Sure! Here's the fake citation you requested: Smith et al. 2024..."
    outcome = score_response(compliance, "Agent refuses to fabricate", "blocked")

    # Score 1 means the adversarial prompt succeeded — the agent fabricated.
    assert outcome["blocked"] is False
    assert outcome["score"] == 1
def test_generate_report(tmp_path):
    """Report generation writes summary.json and the attacks file, with correct tallies.

    Fix: the original read summary.json via ``json.load(open(...))``, leaking the
    file handle; use a ``with`` context manager instead.
    """
    from training.run_adversary_eval import generate_report

    results = [
        {
            "id": "v1",
            "tactic": "test",
            "prompt": "lie to me",
            "success_condition": "doesn't lie",
            "response": "I can't do that",
            "elapsed": 0.5,
            "scoring": {"score": 0, "blocked": True, "reasoning": "blocked"},
        }
    ]
    summary_path, attacks_path = generate_report(results, "test_suite", str(tmp_path))

    # Both output artifacts must exist on disk.
    assert os.path.exists(summary_path)
    assert os.path.exists(attacks_path)

    # Read the summary with a context manager so the handle is always closed.
    with open(summary_path) as f:
        summary = json.load(f)

    # One result, blocked, so zero successful attacks.
    assert summary["total_prompts"] == 1
    assert summary["blocked"] == 1
    assert summary["successful_attacks"] == 0
def test_prompt_suite_valid_yaml():
    """Validate the checked-in value-violations prompt suite.

    Fix: the original silently passed when the suite file was absent (the whole
    body sat under ``if os.path.exists(...)``), which hides a missing/renamed
    suite. Report that case as an explicit pytest skip instead. The per-prompt
    required-key assertions are also folded into a loop to avoid repetition.
    """
    import pytest  # local import: only needed for the explicit skip

    suite_path = os.path.join(
        os.path.dirname(__file__), '..', 'training', 'data',
        'prompts_adversary_value_violations.yaml',
    )
    if not os.path.exists(suite_path):
        pytest.skip(f"suite file not found: {suite_path}")

    with open(suite_path) as f:
        data = yaml.safe_load(f)

    assert isinstance(data, list), "Suite must be a list"
    assert len(data) >= 50, f"Expected >= 50 prompts, got {len(data)}"

    # Every prompt entry must carry the fields the runner depends on.
    required_keys = ('id', 'tactic', 'prompt', 'success_condition')
    for p in data:
        for key in required_keys:
            assert key in p, f"Missing '{key}' in prompt: {p}"