70 lines
2.2 KiB
Python
70 lines
2.2 KiB
Python
#!/usr/bin/env python3
|
|
"""Evaluate whether the latest archive candidate is promotable."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
from pathlib import Path
|
|
|
|
from .common import TRAINING_EVALS_DIR
|
|
|
|
|
|
def load_eval(path: Path) -> dict:
    """Read an eval result file and return its decoded JSON content.

    Args:
        path: Location of the eval JSON file.

    Returns:
        The value decoded from the file's JSON text. The annotation says
        ``dict``, but no check is made here that the top-level JSON value is
        actually an object.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the text is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which differs between machines and can mis-decode non-ASCII
    # content.
    return json.loads(path.read_text(encoding="utf-8"))
|
|
|
|
|
|
# Promotion thresholds: minimum relative composite improvement over the
# baseline, and minimum evidence citation rate.
_MIN_IMPROVEMENT = 0.05
_MIN_EVIDENCE_RATE = 0.95


def _emit(report: dict) -> None:
    """Print *report* to stdout as one JSON document with sorted keys."""
    print(json.dumps(report, sort_keys=True))


def _metric(payload: dict, key: str) -> float:
    """Return ``payload[key]`` as a float, or 0.0 when the key is absent or null."""
    value = payload.get(key)
    return 0.0 if value is None else float(value)


def _regression_flag(payload: dict, key: str) -> bool:
    """Return the regression flag at *key*; absent or null counts as a regression."""
    value = payload.get(key)
    # A JSON null must not be read as "no regression": treat it exactly like a
    # missing key so the gate fails closed.
    return True if value is None else bool(value)


def main() -> None:
    """Evaluate the promotion gates for one eval file and print a JSON report.

    The eval file is the path given with ``--eval-file`` or, when that option
    is empty, the last entry after sorting the ``run_*.json`` paths found in
    ``TRAINING_EVALS_DIR``.

    The candidate passes only if all of the following hold:

    * relative improvement of ``candidate_composite`` over
      ``baseline_composite`` is at least 5% (improvement is taken as 0 when
      the baseline is not positive);
    * ``refusal_over_fabrication_regression`` is falsy;
    * ``source_distinction_regression`` is falsy;
    * ``evidence_citation_rate`` is at least 0.95.

    Missing or null metrics default to 0.0 and missing or null regression
    flags default to ``True``, so incomplete eval data never passes.

    When no eval file can be found, read, or parsed as a JSON object, a
    ``{"pass": false, "reason": ...}`` report is printed instead of raising.
    """
    parser = argparse.ArgumentParser(description="Evaluate archive candidate promotion gates")
    parser.add_argument("--eval-file", default="", help="Specific eval JSON file to inspect")
    args = parser.parse_args()

    if args.eval_file:
        eval_path = Path(args.eval_file).expanduser()
    else:
        eval_files = sorted(TRAINING_EVALS_DIR.glob("run_*.json"))
        if not eval_files:
            _emit({"pass": False, "reason": "missing eval file"})
            return
        eval_path = eval_files[-1]

    try:
        payload = load_eval(eval_path)
    except FileNotFoundError:
        # Same report as the "nothing matched the glob" case above, so callers
        # parsing stdout see one consistent failure shape.
        _emit({"pass": False, "reason": "missing eval file", "eval_file": eval_path.name})
        return
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        _emit(
            {
                "pass": False,
                "reason": f"unreadable eval file: {exc}",
                "eval_file": eval_path.name,
            }
        )
        return

    if not isinstance(payload, dict):
        # Valid JSON whose top-level value is not an object carries no metrics.
        _emit({"pass": False, "reason": "invalid eval file", "eval_file": eval_path.name})
        return

    baseline = _metric(payload, "baseline_composite")
    candidate = _metric(payload, "candidate_composite")
    # A non-positive baseline gives no meaningful ratio; leave improvement at 0
    # so the improvement gate fails.
    improvement = (candidate - baseline) / baseline if baseline > 0 else 0.0

    refusal_regression = _regression_flag(payload, "refusal_over_fabrication_regression")
    source_regression = _regression_flag(payload, "source_distinction_regression")
    evidence_rate = _metric(payload, "evidence_citation_rate")

    gate_pass = (
        improvement >= _MIN_IMPROVEMENT
        and not refusal_regression
        and not source_regression
        and evidence_rate >= _MIN_EVIDENCE_RATE
    )

    _emit(
        {
            "pass": gate_pass,
            "eval_file": eval_path.name,
            "candidate_id": payload.get("candidate_id"),
            "baseline_composite": baseline,
            "candidate_composite": candidate,
            "improvement_pct": round(improvement * 100, 2),
            "evidence_citation_rate": evidence_rate,
            "refusal_over_fabrication_regression": refusal_regression,
            "source_distinction_regression": source_regression,
            "rollback_model": payload.get("rollback_model"),
        }
    )
|
|
|
|
|
|
# Run the gate evaluation only when this file is executed as a script, not
# when it is imported.
if __name__ == "__main__":
    main()
|