All checks were successful
Lint / lint (pull_request) Successful in 37s
- add a script that inventories Hermes routing/evaluation surfaces relevant to a TensorZero cutover
- generate a markdown and JSON evaluation packet for issue #860
- score gateway replacement, config migration, canary rollout, session feedback, and eval-suite readiness
- add focused regression tests for touchpoint scanning, requirement scoring, and report rendering

Refs #860
150 lines
5.0 KiB
Python
150 lines
5.0 KiB
Python
from pathlib import Path
|
|
import sys
|
|
|
|
# Directory named "scripts" located next to this file's parent directory.
# It is pushed to the front of sys.path so that modules stored there can be
# imported by bare module name from this test file.
SCRIPT_DIR = Path(__file__).resolve().parents[1].joinpath("scripts")
sys.path.insert(0, str(SCRIPT_DIR))
|
|
|
|
import tensorzero_eval_packet as tz
|
|
|
|
|
|
def test_scan_touchpoints_finds_expected_matches(tmp_path):
    """Check that scanning a small fake tree reports every expected label.

    A handful of tiny source files is written under ``tmp_path`` and
    ``tz.scan_touchpoints`` is run against that directory. The labels of the
    returned touchpoints must include each name in ``expected_labels``.
    """
    # Relative path -> file body, listed in the order the files are created.
    fixture_files = {
        "run_agent.py": "self._fallback_chain = []\n# Provider fallback chain\n",
        "hermes_cli/runtime_provider.py": (
            "def resolve_runtime_provider():\n return {}\n"
        ),
        "agent/smart_model_routing.py": (
            "def resolve_turn_route(user_message, routing_config, primary):\n return primary\n"
        ),
        "gateway/run.py": "def _load_provider_routing():\n return {}\n",
        "cron/scheduler.py": (
            "runtime = resolve_runtime_provider()\nturn_route = resolve_turn_route('x', {}, {})\n"
        ),
        "hermes_state.py": "class SessionDB:\n pass\n",
        "benchmarks/tool_call_benchmark.py": "class ToolCall: ...\n",
    }
    for relative_path, body in fixture_files.items():
        target = tmp_path / relative_path
        # Top-level files live directly in tmp_path, which already exists.
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(body)

    found_labels = {
        touchpoint.label for touchpoint in tz.scan_touchpoints(tmp_path)
    }

    expected_labels = (
        "fallback_chain",
        "runtime_provider",
        "smart_model_routing",
        "gateway_provider_routing",
        "cron_runtime_provider",
        "session_db",
        "benchmark_suite",
    )
    for expected in expected_labels:
        assert expected in found_labels
|
|
|
|
|
|
def test_build_requirement_matrix_marks_canary_as_gap_without_split_support():
    """Check the requirement statuses produced for a fixed set of touchpoints.

    Nine touchpoints are handed to ``tz.build_requirement_matrix``. The row
    keyed ``canary_rollout`` must have status ``"gap"``; the other four rows
    checked here must have status ``"partial"``.
    """
    # (label, file_path, line_number, matched_text) for each touchpoint.
    touchpoint_rows = (
        ("runtime_provider", "hermes_cli/runtime_provider.py", 10,
         "def resolve_runtime_provider"),
        ("provider_routing_config", "cli.py", 20, "provider_routing"),
        ("fallback_chain", "run_agent.py", 21, "_fallback_chain = []"),
        ("smart_model_routing", "agent/smart_model_routing.py", 30,
         "resolve_turn_route"),
        ("gateway_provider_routing", "gateway/run.py", 35,
         "def _load_provider_routing"),
        ("cron_runtime_provider", "cron/scheduler.py", 36,
         "runtime = resolve_runtime_provider()"),
        ("session_db", "hermes_state.py", 40, "class SessionDB"),
        ("trajectory_export", "batch_runner.py", 50, "trajectory_entry"),
        ("benchmark_suite", "benchmarks/tool_call_benchmark.py", 60,
         "ToolCall"),
    )
    touchpoints = [
        tz.Touchpoint(
            label=label,
            file_path=file_path,
            line_number=line_number,
            matched_text=matched_text,
        )
        for label, file_path, line_number, matched_text in touchpoint_rows
    ]

    rows_by_key = {
        row.key: row for row in tz.build_requirement_matrix(touchpoints)
    }

    expected_status = {
        "gateway_replacement": "partial",
        "config_migration": "partial",
        "canary_rollout": "gap",
        "session_feedback": "partial",
        "evaluation_suite": "partial",
    }
    for requirement_key, status in expected_status.items():
        assert rows_by_key[requirement_key].status == status
|
|
|
|
|
|
def test_build_markdown_renders_recommendation_and_touchpoints():
    """Check the markdown rendered for a two-touchpoint report.

    The output must contain the packet title, a human-readable requirement
    label, the recommendation text, and a ``path:line`` reference for each
    touchpoint. It must not contain the raw ``gateway_replacement`` key.
    """
    runtime_provider = tz.Touchpoint(
        label="runtime_provider",
        file_path="hermes_cli/runtime_provider.py",
        line_number=10,
        matched_text="def resolve_runtime_provider",
    )
    session_db = tz.Touchpoint(
        label="session_db",
        file_path="hermes_state.py",
        line_number=40,
        matched_text="class SessionDB",
    )
    touchpoints = [runtime_provider, session_db]

    markdown = tz.build_markdown(
        tz.build_report(touchpoints, tz.build_requirement_matrix(touchpoints))
    )

    assert "# TensorZero Evaluation Packet" in markdown
    # Requirements are rendered with human labels, not their raw keys.
    assert "gateway_replacement" not in markdown
    required_fragments = (
        "Gateway replacement scope",
        "Not ready for direct replacement",
        "hermes_cli/runtime_provider.py:10",
        "hermes_state.py:40",
    )
    for fragment in required_fragments:
        assert fragment in markdown
|
|
|
|
|
|
def test_issue_context_is_embedded_in_report():
    """Check that a report built from empty inputs still carries issue context.

    With no touchpoints and no requirement rows, the rendered markdown must
    mention "Issue #860", "tensorzero" (case-insensitively), and "10% traffic".
    """
    markdown = tz.build_markdown(tz.build_report([], []))

    assert "Issue #860" in markdown
    # The product name is matched without regard to capitalisation.
    assert "tensorzero" in markdown.lower()
    assert "10% traffic" in markdown
|