263 lines
11 KiB
Python
263 lines
11 KiB
Python
|
|
"""Tests for the Soul Eval Gate.
|
||
|
|
|
||
|
|
These tests verify the conscience of the training pipeline:
|
||
|
|
- Sacred categories (crisis, pastoral_care) are HARD gates
|
||
|
|
- Composite regression beyond threshold triggers rejection
|
||
|
|
- Multiple warnings trigger rejection
|
||
|
|
- The gate file is written correctly for tasks.py to read
|
||
|
|
- Partial eval data generates warnings, not silent passes
|
||
|
|
"""
|
||
|
|
|
||
|
|
import json
|
||
|
|
import tempfile
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
# Add bin/ to path for the import
|
||
|
|
import sys
|
||
|
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "bin"))
|
||
|
|
|
||
|
|
from soul_eval_gate import (
|
||
|
|
evaluate_candidate,
|
||
|
|
SACRED_CATEGORIES,
|
||
|
|
MINIMUM_COMPOSITE,
|
||
|
|
MAX_METRIC_REGRESSION,
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
def _make_scores(model, aggregate, per_session=None):
    """Assemble the smallest scores.json-shaped dict these tests need.

    ``model`` and ``aggregate`` are stored as given under "model" and
    "aggregate_scores". A falsy ``per_session`` (None or empty) becomes an
    empty list. The timestamp and session count are fixed placeholder values.
    """
    sessions = per_session if per_session else []
    scores = {"model": model}
    scores["timestamp"] = "2026-03-30T12:00:00"
    scores["sessions_evaluated"] = 12
    scores["aggregate_scores"] = aggregate
    scores["per_session"] = sessions
    return scores
|
||
|
|
|
||
|
|
|
||
|
|
def _write_json(path, data):
    """Serialize ``data`` with ``json.dumps`` and write the text to ``path``."""
    payload = json.dumps(data)
    path.write_text(payload)
|
||
|
|
|
||
|
|
|
||
|
|
# ── Sacred category tests ────────────────────────────────────────────
|
||
|
|
|
||
|
|
def test_crisis_regression_rejects_adapter():
    """SOUL.md §'When a Man Is Dying': a worse crisis score means REJECT.

    The candidate beats the baseline on the aggregate composite and on
    tool_selection, but its "crisis" session score falls from 0.85 to 0.70.
    The gate must fail it regardless, report the crisis check as failed, and
    say so in the verdict. This is the most important test in the file.
    """
    def crisis_session(score):
        # One per-session record for the sacred "crisis" category.
        return {"session_id": "crisis", "composite": score, "scores": {}, "turn_details": []}

    base_scores = _make_scores(
        "base",
        {"composite": 0.7, "tool_selection": 0.8},
        per_session=[crisis_session(0.85)],
    )
    cand_scores = _make_scores(
        "cand",
        {"composite": 0.75, "tool_selection": 0.9},
        per_session=[crisis_session(0.70)],
    )

    with tempfile.TemporaryDirectory() as scratch:
        out_dir = Path(scratch)
        base_file = out_dir / "base.json"
        cand_file = out_dir / "cand.json"
        _write_json(base_file, base_scores)
        _write_json(cand_file, cand_scores)

        outcome = evaluate_candidate(cand_file, base_file, "test-crisis", out_dir)

        why_rejected = (
            "Adapter MUST be rejected when crisis score degrades. "
            "SOUL.md: 'If adapter degrades this, adapter is REJECTED.'"
        )
        assert not outcome["pass"], why_rejected
        assert "crisis" in outcome["sacred_check"]
        assert not outcome["sacred_check"]["crisis"]["pass"]
        for marker in ("REJECTED", "SOUL"):
            assert marker in outcome["verdict"]
|
||
|
|
|
||
|
|
|
||
|
|
def test_pastoral_care_regression_rejects_adapter():
    """Pastoral care regression = REJECT, same logic as crisis.

    The aggregate composite improves (0.6 -> 0.65) while the "pastoral_care"
    session score drops (0.80 -> 0.60). The gate must reject the candidate
    and mark the pastoral_care sacred check itself as failed.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        gate_dir = Path(tmpdir)

        baseline = _make_scores(
            "base",
            {"composite": 0.6},
            per_session=[{"session_id": "pastoral_care", "composite": 0.80, "scores": {}, "turn_details": []}],
        )
        candidate = _make_scores(
            "cand",
            {"composite": 0.65},
            per_session=[{"session_id": "pastoral_care", "composite": 0.60, "scores": {}, "turn_details": []}],
        )

        base_path = gate_dir / "base.json"
        cand_path = gate_dir / "cand.json"
        _write_json(base_path, baseline)
        _write_json(cand_path, candidate)

        result = evaluate_candidate(cand_path, base_path, "test-pastoral", gate_dir)

        assert not result["pass"], "Pastoral care regression must reject adapter"
        assert "pastoral_care" in result["sacred_check"]
        # Key presence alone does not show the check failed; mirror the
        # crisis test and require the sacred check to be marked as not passing.
        assert not result["sacred_check"]["pastoral_care"]["pass"]
|
||
|
|
|
||
|
|
|
||
|
|
# ── Passing tests ────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
def test_improvement_across_board_passes():
    """A candidate that scores higher on every metric and session passes.

    Composite, brevity and tool_selection all rise, and so do both the
    "crisis" and "pastoral_care" session scores.
    """
    def session(name, score):
        # Per-session record with only the fields this fixture sets.
        return {"session_id": name, "composite": score, "scores": {}, "turn_details": []}

    base_scores = _make_scores(
        "base",
        {"composite": 0.65, "brevity": 0.7, "tool_selection": 0.6},
        per_session=[session("crisis", 0.80), session("pastoral_care", 0.75)],
    )
    cand_scores = _make_scores(
        "cand",
        {"composite": 0.72, "brevity": 0.75, "tool_selection": 0.7},
        per_session=[session("crisis", 0.85), session("pastoral_care", 0.80)],
    )

    with tempfile.TemporaryDirectory() as scratch:
        out_dir = Path(scratch)
        base_file = out_dir / "base.json"
        cand_file = out_dir / "cand.json"
        _write_json(base_file, base_scores)
        _write_json(cand_file, cand_scores)

        outcome = evaluate_candidate(cand_file, base_file, "test-pass", out_dir)

        assert outcome["pass"], f"Should pass: {outcome['verdict']}"
        assert "PASSED" in outcome["verdict"]
|
||
|
|
|
||
|
|
|
||
|
|
def test_sacred_improvement_is_noted():
    """An improved crisis score shows up as a passing check with delta > 0.

    The crisis session goes from 0.75 (baseline) to 0.85 (candidate).
    """
    def crisis_session(score):
        return {"session_id": "crisis", "composite": score, "scores": {}, "turn_details": []}

    with tempfile.TemporaryDirectory() as scratch:
        out_dir = Path(scratch)
        base_file = out_dir / "base.json"
        cand_file = out_dir / "cand.json"

        _write_json(
            base_file,
            _make_scores("base", {"composite": 0.65}, per_session=[crisis_session(0.75)]),
        )
        _write_json(
            cand_file,
            _make_scores("cand", {"composite": 0.70}, per_session=[crisis_session(0.85)]),
        )

        outcome = evaluate_candidate(cand_file, base_file, "test-improve", out_dir)
        crisis_check = outcome["sacred_check"]["crisis"]
        assert crisis_check["pass"]
        assert crisis_check["delta"] > 0
|
||
|
|
|
||
|
|
|
||
|
|
# ── Composite regression test ────────────────────────────────────────
|
||
|
|
|
||
|
|
def test_large_composite_regression_rejects():
    """A large composite drop is rejected even with no sacred violation.

    The composite falls from 0.75 to 0.60 and no per-session data is given,
    so no sacred category can be the cause. The verdict must mention that the
    score "regressed". (The original note describes the threshold as a >10%
    regression.)
    """
    with tempfile.TemporaryDirectory() as scratch:
        out_dir = Path(scratch)
        base_file = out_dir / "base.json"
        cand_file = out_dir / "cand.json"
        _write_json(base_file, _make_scores("base", {"composite": 0.75}))
        _write_json(cand_file, _make_scores("cand", {"composite": 0.60}))

        outcome = evaluate_candidate(cand_file, base_file, "test-composite", out_dir)

        assert not outcome["pass"], "Large composite regression should reject"
        verdict_text = outcome["verdict"].lower()
        assert "regressed" in verdict_text
|
||
|
|
|
||
|
|
|
||
|
|
def test_below_minimum_composite_rejects():
    """A candidate below MINIMUM_COMPOSITE is rejected.

    Baseline and candidate carry the same composite, so no regression is
    involved: the absolute floor is the only thing that can fail this
    candidate. A fixture where the candidate also drops well below the
    baseline could be rejected for the drop alone and would not exercise the
    floor on its own.
    """
    # Half the floor is strictly below it for any positive MINIMUM_COMPOSITE,
    # so the fixture stays valid if the constant is retuned.
    below_floor = MINIMUM_COMPOSITE / 2

    with tempfile.TemporaryDirectory() as tmpdir:
        gate_dir = Path(tmpdir)

        baseline = _make_scores("base", {"composite": below_floor})
        candidate = _make_scores("cand", {"composite": below_floor})

        base_path = gate_dir / "base.json"
        cand_path = gate_dir / "cand.json"
        _write_json(base_path, baseline)
        _write_json(cand_path, candidate)

        result = evaluate_candidate(cand_path, base_path, "test-minimum", gate_dir)

        assert not result["pass"], (
            f"Composite {below_floor} below minimum {MINIMUM_COMPOSITE} should reject"
        )
|
||
|
|
|
||
|
|
|
||
|
|
# ── Gate file output test ────────────────────────────────────────────
|
||
|
|
|
||
|
|
def test_gate_file_written_for_tasks_py():
    """The gate must leave behind the files tasks.py reads, in the expected shape.

    Per the original note, tasks.py calls latest_eval_gate(), which reads
    eval_gate_latest.json. That file must carry the 'pass', 'candidate_id'
    and 'rollback_model' keys. A second file named after the candidate id
    must also be written.
    """
    with tempfile.TemporaryDirectory() as scratch:
        out_dir = Path(scratch)
        base_file = out_dir / "base.json"
        cand_file = out_dir / "cand.json"
        _write_json(base_file, _make_scores("hermes3:8b", {"composite": 0.65}))
        _write_json(cand_file, _make_scores("timmy:v1", {"composite": 0.70}))

        evaluate_candidate(cand_file, base_file, "timmy-v1-test", out_dir)

        # The "latest" pointer file must exist before its content is checked.
        latest_file = out_dir / "eval_gate_latest.json"
        assert latest_file.exists(), "eval_gate_latest.json not written"

        written = json.loads(latest_file.read_text())
        required = (
            ("pass", "Gate file missing 'pass' key"),
            ("candidate_id", "Gate file missing 'candidate_id' key"),
            ("rollback_model", "Gate file missing 'rollback_model' key"),
        )
        for key, complaint in required:
            assert key in written, complaint
        assert written["candidate_id"] == "timmy-v1-test"
        assert written["rollback_model"] == "hermes3:8b"

        # The per-candidate gate file is written alongside the latest one.
        per_candidate_file = out_dir / "eval_gate_timmy-v1-test.json"
        assert per_candidate_file.exists(), "Named gate file not written"
|
||
|
|
|
||
|
|
|
||
|
|
# ── Missing sacred data warning test ─────────────────────────────────
|
||
|
|
|
||
|
|
def test_missing_sacred_data_warns_not_passes():
    """Missing sacred-category data must produce warnings, not a silent pass.

    Neither scores file has any per_session entries. The candidate still
    passes because its composite improved, but the result must carry at least
    one warning per sacred category, and at least one warning must mention
    "sacred".
    """
    with tempfile.TemporaryDirectory() as scratch:
        out_dir = Path(scratch)
        base_file = out_dir / "base.json"
        cand_file = out_dir / "cand.json"

        # Both fixtures omit per_session, so there is no sacred data to compare.
        _write_json(base_file, _make_scores("base", {"composite": 0.65}))
        _write_json(cand_file, _make_scores("cand", {"composite": 0.70}))

        outcome = evaluate_candidate(cand_file, base_file, "test-missing", out_dir)

        # Composite improved, so the overall result is a pass with warnings.
        assert outcome["pass"]

        warning_count = len(outcome["warnings"])
        category_count = len(SACRED_CATEGORIES)
        assert warning_count >= category_count, (
            "Each missing sacred category should generate a warning. "
            f"Got {warning_count} warnings for "
            f"{category_count} sacred categories."
        )
        assert any(
            "SACRED" in text or "sacred" in text.lower()
            for text in outcome["warnings"]
        )
|
||
|
|
|
||
|
|
|
||
|
|
# ── Constants sanity tests ───────────────────────────────────────────
|
||
|
|
|
||
|
|
def test_sacred_categories_include_crisis_and_pastoral():
    """SACRED_CATEGORIES must contain both non-negotiable SOUL.md categories."""
    for required in ("crisis", "pastoral_care"):
        assert required in SACRED_CATEGORIES
|
||
|
|
|
||
|
|
|
||
|
|
def test_minimum_composite_is_reasonable():
    """MINIMUM_COMPOSITE must sit in [0.1, 0.5]: low enough for small models, never zero."""
    lowest_allowed, highest_allowed = 0.1, 0.5
    assert lowest_allowed <= MINIMUM_COMPOSITE
    assert MINIMUM_COMPOSITE <= highest_allowed
|