Files
hermes-agent/tests/test_tensorzero_eval_packet.py
Alexander Whitestone 755e7513a1
All checks were successful
Lint / lint (pull_request) Successful in 37s
docs: add grounded tensorzero evaluation packet (#860)
- add a script that inventories Hermes routing/evaluation surfaces relevant to a TensorZero cutover
- generate a markdown and JSON evaluation packet for issue #860
- score gateway replacement, config migration, canary rollout, session feedback, and eval-suite readiness
- add focused regression tests for touchpoint scanning, requirement scoring, and report rendering

Refs #860
2026-04-22 11:33:31 -04:00

150 lines
5.0 KiB
Python

from pathlib import Path
import sys
# The module under test is a standalone script, not an installed package.
# It is looked up in the "scripts" directory next to this file's parent
# directory (parents[0] is this file's directory, parents[1] is one level up).
SCRIPT_DIR = Path(__file__).resolve().parents[1] / "scripts"
# Put the scripts directory first on sys.path so the import below resolves
# to it; this must run before the import.
sys.path.insert(0, str(SCRIPT_DIR))
import tensorzero_eval_packet as tz
def test_scan_touchpoints_finds_expected_matches(tmp_path):
    """Scan a miniature source tree and check the reported labels.

    Seven small stub files are written under ``tmp_path``; the labels of
    the touchpoints returned by ``tz.scan_touchpoints`` must include all
    seven names checked at the end of the test.
    """
    # (path parts relative to tmp_path, file body), in creation order.
    fixture_files = (
        (
            ("run_agent.py",),
            "self._fallback_chain = []\n# Provider fallback chain\n",
        ),
        (
            ("hermes_cli", "runtime_provider.py"),
            "def resolve_runtime_provider():\n return {}\n",
        ),
        (
            ("agent", "smart_model_routing.py"),
            "def resolve_turn_route(user_message, routing_config, primary):\n return primary\n",
        ),
        (
            ("gateway", "run.py"),
            "def _load_provider_routing():\n return {}\n",
        ),
        (
            ("cron", "scheduler.py"),
            "runtime = resolve_runtime_provider()\nturn_route = resolve_turn_route('x', {}, {})\n",
        ),
        (
            ("hermes_state.py",),
            "class SessionDB:\n pass\n",
        ),
        (
            ("benchmarks", "tool_call_benchmark.py"),
            "class ToolCall: ...\n",
        ),
    )
    for parts, body in fixture_files:
        target = tmp_path.joinpath(*parts)
        # No-op for top-level files (tmp_path already exists); otherwise
        # creates the single sub-directory that holds the file.
        target.parent.mkdir(exist_ok=True)
        target.write_text(body)

    found_labels = {tp.label for tp in tz.scan_touchpoints(tmp_path)}

    for wanted in (
        "fallback_chain",
        "runtime_provider",
        "smart_model_routing",
        "gateway_provider_routing",
        "cron_runtime_provider",
        "session_db",
        "benchmark_suite",
    ):
        assert wanted in found_labels
def test_build_requirement_matrix_marks_canary_as_gap_without_split_support():
    """Check the status assigned to each requirement row.

    Given the nine touchpoints listed below, ``tz.build_requirement_matrix``
    must report ``canary_rollout`` as ``"gap"`` and the four other checked
    requirements as ``"partial"``.
    """
    # (label, file_path, line_number, matched_text)
    touchpoint_rows = (
        ("runtime_provider", "hermes_cli/runtime_provider.py", 10,
         "def resolve_runtime_provider"),
        ("provider_routing_config", "cli.py", 20, "provider_routing"),
        ("fallback_chain", "run_agent.py", 21, "_fallback_chain = []"),
        ("smart_model_routing", "agent/smart_model_routing.py", 30,
         "resolve_turn_route"),
        ("gateway_provider_routing", "gateway/run.py", 35,
         "def _load_provider_routing"),
        ("cron_runtime_provider", "cron/scheduler.py", 36,
         "runtime = resolve_runtime_provider()"),
        ("session_db", "hermes_state.py", 40, "class SessionDB"),
        ("trajectory_export", "batch_runner.py", 50, "trajectory_entry"),
        ("benchmark_suite", "benchmarks/tool_call_benchmark.py", 60,
         "ToolCall"),
    )
    touchpoints = [
        tz.Touchpoint(
            label=label,
            file_path=path,
            line_number=line,
            matched_text=text,
        )
        for label, path, line, text in touchpoint_rows
    ]

    rows_by_key = {}
    for row in tz.build_requirement_matrix(touchpoints):
        rows_by_key[row.key] = row

    expected_status = {
        "gateway_replacement": "partial",
        "config_migration": "partial",
        "canary_rollout": "gap",
        "session_feedback": "partial",
        "evaluation_suite": "partial",
    }
    for key, status in expected_status.items():
        assert rows_by_key[key].status == status
def test_build_markdown_renders_recommendation_and_touchpoints():
    """Render a two-touchpoint report and check the markdown content.

    The output must carry the packet title, a human-readable requirement
    label instead of its raw key, the recommendation text, and a
    ``path:line`` reference for each touchpoint.
    """
    runtime_tp = tz.Touchpoint(
        label="runtime_provider",
        file_path="hermes_cli/runtime_provider.py",
        line_number=10,
        matched_text="def resolve_runtime_provider",
    )
    session_tp = tz.Touchpoint(
        label="session_db",
        file_path="hermes_state.py",
        line_number=40,
        matched_text="class SessionDB",
    )
    touchpoints = [runtime_tp, session_tp]

    markdown = tz.build_markdown(
        tz.build_report(touchpoints, tz.build_requirement_matrix(touchpoints))
    )

    assert "# TensorZero Evaluation Packet" in markdown
    # human labels, not raw keys
    assert "gateway_replacement" not in markdown
    for snippet in (
        "Gateway replacement scope",
        "Not ready for direct replacement",
        "hermes_cli/runtime_provider.py:10",
        "hermes_state.py:40",
    ):
        assert snippet in markdown
def test_issue_context_is_embedded_in_report():
    """Check that an empty report still carries the issue context.

    With no touchpoints and no requirement rows, the rendered markdown must
    mention "Issue #860", "tensorzero" (case-insensitively), and
    "10% traffic".
    """
    empty_report = tz.build_report([], [])
    rendered = tz.build_markdown(empty_report)

    assert "Issue #860" in rendered
    lowered = rendered.lower()
    assert "tensorzero" in lowered
    assert "10% traffic" in rendered