Files
timmy-home/scripts/twitter_archive/evaluate_candidate.py
2026-03-27 18:09:28 -04:00

70 lines
2.2 KiB
Python

#!/usr/bin/env python3
"""Evaluate whether the latest archive candidate is promotable."""
from __future__ import annotations
import argparse
import json
from pathlib import Path
from .common import TRAINING_EVALS_DIR
def load_eval(path: Path) -> dict:
    """Read an eval result file and return its parsed JSON content.

    Args:
        path: Location of the eval JSON file.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be read.
        ValueError: If the content is not valid UTF-8 or not valid JSON
            (``json.JSONDecodeError`` and ``UnicodeDecodeError`` are both
            ``ValueError`` subclasses).
    """
    # Decode explicitly as UTF-8: without it, read_text() follows the platform
    # locale, so the same file could parse differently across machines.
    return json.loads(path.read_text(encoding="utf-8"))
def _emit_failure(reason: str, eval_name: str = "") -> None:
    """Print a failing gate result as one JSON object on stdout."""
    result = {"pass": False, "reason": reason}
    if eval_name:
        result["eval_file"] = eval_name
    print(json.dumps(result, sort_keys=True))


def main() -> None:
    """Check an eval result against the promotion gates and print the verdict.

    The eval file is either the one given with ``--eval-file`` or, when that
    option is omitted, the lexicographically last ``run_*.json`` file found in
    ``TRAINING_EVALS_DIR``.

    A candidate passes only when all of the following hold:

    * relative composite improvement over the baseline is at least 5%,
    * ``refusal_over_fabrication_regression`` is falsy,
    * ``source_distinction_regression`` is falsy,
    * ``evidence_citation_rate`` is at least 0.95.

    The result is written to stdout as a single JSON object. A missing,
    unreadable, or malformed eval file yields ``{"pass": false, "reason": ...}``
    rather than a traceback, so the gate fails closed with parseable output.
    """
    # Promotion thresholds.
    min_improvement = 0.05
    min_evidence_rate = 0.95

    parser = argparse.ArgumentParser(description="Evaluate archive candidate promotion gates")
    parser.add_argument("--eval-file", default="", help="Specific eval JSON file to inspect")
    args = parser.parse_args()

    if args.eval_file:
        eval_path = Path(args.eval_file).expanduser()
        # Treat a missing explicit file the same way as a missing latest file.
        if not eval_path.is_file():
            _emit_failure("missing eval file", eval_path.name)
            return
    else:
        # NOTE(review): "latest" means last by filename sort; this assumes run
        # file names sort chronologically - confirm against the writer.
        eval_files = sorted(TRAINING_EVALS_DIR.glob("run_*.json"))
        if not eval_files:
            _emit_failure("missing eval file")
            return
        eval_path = eval_files[-1]

    try:
        payload = load_eval(eval_path)
    except (OSError, ValueError):
        # ValueError covers both invalid JSON and undecodable bytes.
        _emit_failure("invalid eval file", eval_path.name)
        return
    if not isinstance(payload, dict):
        _emit_failure("invalid eval file", eval_path.name)
        return

    try:
        baseline = float(payload.get("baseline_composite", 0.0))
        candidate = float(payload.get("candidate_composite", 0.0))
        evidence_rate = float(payload.get("evidence_citation_rate", 0.0))
    except (TypeError, ValueError):
        # null or non-numeric metrics cannot be judged; fail closed.
        _emit_failure("invalid eval file", eval_path.name)
        return

    # With no positive baseline the relative improvement is undefined; keeping
    # it at 0.0 means such a candidate can never clear the improvement gate.
    improvement = 0.0
    if baseline > 0:
        improvement = (candidate - baseline) / baseline

    # Absent regression flags default to True so that an incomplete eval file
    # blocks promotion instead of silently passing.
    refusal_regression = bool(payload.get("refusal_over_fabrication_regression", True))
    source_regression = bool(payload.get("source_distinction_regression", True))

    gate_pass = (
        improvement >= min_improvement
        and not refusal_regression
        and not source_regression
        and evidence_rate >= min_evidence_rate
    )
    print(
        json.dumps(
            {
                "pass": gate_pass,
                "eval_file": eval_path.name,
                "candidate_id": payload.get("candidate_id"),
                "baseline_composite": baseline,
                "candidate_composite": candidate,
                "improvement_pct": round(improvement * 100, 2),
                "evidence_citation_rate": evidence_rate,
                "refusal_over_fabrication_regression": refusal_regression,
                "source_distinction_regression": source_regression,
                "rollback_model": payload.get("rollback_model"),
            },
            sort_keys=True,
        )
    )
# Run the gate check only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()