All checks were successful
Lint / lint (pull_request) Successful in 37s
- Add a script that inventories Hermes routing/evaluation surfaces relevant to a TensorZero cutover.
- Generate a markdown and JSON evaluation packet for issue #860.
- Score gateway replacement, config migration, canary rollout, session feedback, and eval-suite readiness.
- Add focused regression tests for touchpoint scanning, requirement scoring, and report rendering.

Refs #860
319 lines
13 KiB
Python
319 lines
13 KiB
Python
#!/usr/bin/env python3
|
|
"""Generate a grounded TensorZero evaluation packet for Hermes.
|
|
|
|
This script inventories the current Hermes routing/evaluation surfaces, then
|
|
builds a markdown packet assessing how much of issue #860 can be satisfied by
|
|
TensorZero and where the migration risk still lives.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
from dataclasses import asdict, dataclass
|
|
from pathlib import Path
|
|
from typing import Iterable
|
|
|
|
# Identity of the forge issue this packet is generated for; copied verbatim
# into the report header by build_report().
ISSUE_NUMBER = 860
ISSUE_TITLE = "tensorzero LLMOps platform evaluation"
ISSUE_URL = "https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/860"

# Default report destinations; parse_args() uses them as the defaults for
# --output and --json-output.
DEFAULT_OUTPUT = Path("docs/evaluations/tensorzero-860-evaluation.md")
DEFAULT_JSON_OUTPUT = Path("docs/evaluations/tensorzero-860-evaluation.json")
|
|
|
|
|
|
@dataclass(frozen=True)
class TouchpointPattern:
    """One scan rule: a repo file plus the regex that marks a touchpoint in it."""

    # Identifier copied onto every Touchpoint produced from this rule.
    label: str
    # File to scan, joined onto the repo root by the scanner.
    file_path: str
    # Regular-expression source; the scanner searches it line by line,
    # case-insensitively.
    regex: str
    # Human-readable note on why this surface matters for the evaluation.
    description: str
|
|
|
|
|
|
@dataclass(frozen=True)
class Touchpoint:
    """A single source line that matched one of the scan rules."""

    # Label of the rule that produced this hit.
    label: str
    # Path of the scanned file, as written in the rule (not resolved).
    file_path: str
    # 1-based line number of the matching line.
    line_number: int
    # The matching line with surrounding whitespace stripped.
    matched_text: str
|
|
|
|
|
|
@dataclass(frozen=True)
class RequirementStatus:
    """Scored verdict for one requirement of the evaluated issue."""

    # Stable machine-readable identifier, e.g. "gateway_replacement".
    key: str
    # Display name used in the first column of the requirement matrix.
    name: str
    # Readiness verdict; build_requirement_matrix() emits "partial" or "gap".
    status: str
    # Labels of the touchpoints that back the verdict (may be empty).
    evidence_labels: tuple[str, ...]
    # One-paragraph explanation rendered in the matrix.
    summary: str
|
|
|
|
|
|
@dataclass(frozen=True)
class EvaluationReport:
    """Everything needed to render the evaluation packet (markdown and JSON)."""

    # Issue identity shown in the packet header.
    issue_number: int
    issue_title: str
    issue_url: str
    # Overall go/no-go text rendered under "Recommendation".
    recommendation: str
    # Every matching source line found by the scan, in scan order.
    touchpoints: tuple[Touchpoint, ...]
    # One scored row per evaluated requirement.
    requirements: tuple[RequirementStatus, ...]
|
|
|
|
|
|
# Scan rules consumed by scan_touchpoints(). Each rule names one file (relative
# to the repo root) and a regex that _iter_matches() applies line by line with
# re.IGNORECASE. The labels are the evidence keys that
# build_requirement_matrix() looks for, so renaming a label here requires the
# same rename there.
PATTERNS: tuple[TouchpointPattern, ...] = (
    # --- Provider routing / fallback surfaces ---
    TouchpointPattern(
        label="fallback_chain",
        file_path="run_agent.py",
        regex=r"_fallback_chain|fallback_providers|fallback_model",
        description="Primary agent fallback-provider chain in the core conversation loop.",
    ),
    TouchpointPattern(
        label="provider_routing_config",
        file_path="cli.py",
        regex=r"provider_routing|fallback_providers|smart_model_routing",
        description="CLI-owned provider routing and fallback configuration surfaces.",
    ),
    TouchpointPattern(
        label="runtime_provider",
        file_path="hermes_cli/runtime_provider.py",
        regex=r"def resolve_runtime_provider|def resolve_requested_provider",
        description="Central runtime provider resolution for CLI, gateway, cron, and helpers.",
    ),
    TouchpointPattern(
        label="smart_model_routing",
        file_path="agent/smart_model_routing.py",
        regex=r"def resolve_turn_route|def choose_cheap_model_route",
        description="Cheap-vs-strong turn routing that TensorZero would need to absorb or replace.",
    ),
    TouchpointPattern(
        label="gateway_provider_routing",
        file_path="gateway/run.py",
        regex=r"def _load_provider_routing|def _load_fallback_model|def _load_smart_model_routing",
        description="Gateway-specific loading of routing, fallback, and smart-model policies.",
    ),
    TouchpointPattern(
        label="cron_runtime_provider",
        file_path="cron/scheduler.py",
        regex=r"resolve_runtime_provider|resolve_turn_route|provider_routing|fallback_model",
        description="Cron execution path that re-resolves providers and routing on every run.",
    ),
    TouchpointPattern(
        label="auxiliary_fallback_chain",
        file_path="agent/auxiliary_client.py",
        regex=r"fallback chain|_get_provider_chain|provider chain",
        description="Auxiliary task routing/fallback chain outside the main inference path.",
    ),
    TouchpointPattern(
        label="delegate_runtime_provider",
        file_path="tools/delegate_tool.py",
        regex=r"runtime provider system|resolve the full credential bundle|resolve_runtime_provider",
        description="Subagent/delegation routing path that would also need TensorZero parity.",
    ),
    # --- Session / trajectory data surfaces ---
    TouchpointPattern(
        label="session_db",
        file_path="hermes_state.py",
        regex=r"class SessionDB",
        description="Session persistence surface that could feed TensorZero optimization/eval data.",
    ),
    TouchpointPattern(
        label="trajectory_export",
        file_path="batch_runner.py",
        regex=r"trajectory_entry|save_trajectories|_convert_to_trajectory_format",
        description="Trajectory export surface for offline optimization and replay data.",
    ),
    # --- Evaluation harness ---
    # NOTE(review): the bare "benchmark" alternative is searched
    # case-insensitively, so any line mentioning the word counts as a hit.
    TouchpointPattern(
        label="benchmark_suite",
        file_path="benchmarks/tool_call_benchmark.py",
        regex=r"ToolCall\(|class ToolCall|benchmark",
        description="Existing benchmark/evaluation harness that could map to TensorZero experiments.",
    ),
)
|
|
|
|
|
|
def _iter_matches(pattern: TouchpointPattern, text: str) -> Iterable[Touchpoint]:
    """Yield a Touchpoint for each line of *text* that matches *pattern*.

    The rule's regex is searched case-insensitively against every line.
    A line produces at most one touchpoint no matter how many times the
    regex matches within it. Line numbers are 1-based and the recorded
    text is the line with leading/trailing whitespace removed.
    """
    matcher = re.compile(pattern.regex, re.IGNORECASE)
    for number, raw_line in enumerate(text.splitlines(), 1):
        if matcher.search(raw_line) is None:
            continue
        yield Touchpoint(
            label=pattern.label,
            file_path=pattern.file_path,
            line_number=number,
            matched_text=raw_line.strip(),
        )
|
|
|
|
|
|
def scan_touchpoints(repo_root: Path) -> list[Touchpoint]:
    """Scan the files named in PATTERNS under *repo_root* for touchpoints.

    Args:
        repo_root: Directory that the rules' relative ``file_path`` values
            are joined onto.

    Returns:
        All matching lines, grouped by rule in PATTERNS order and, within a
        rule, in file order. Rules whose file is absent contribute nothing.
    """
    touchpoints: list[Touchpoint] = []
    for pattern in PATTERNS:
        path = repo_root / pattern.file_path
        # is_file() rather than exists(): a directory that happens to carry
        # the expected name must be skipped, not handed to read_text().
        if not path.is_file():
            continue
        # The scan is a best-effort inventory, so an undecodable byte is
        # replaced instead of aborting the whole run with UnicodeDecodeError.
        text = path.read_text(encoding="utf-8", errors="replace")
        touchpoints.extend(_iter_matches(pattern, text))
    return touchpoints
|
|
|
|
|
|
def build_requirement_matrix(touchpoints: list[Touchpoint]) -> list[RequirementStatus]:
    """Score the five issue requirements against the scanned touchpoints.

    Only the *labels* present in ``touchpoints`` matter; how many lines
    matched per label is ignored. Rows are returned in a fixed order:
    gateway replacement, config migration, canary rollout, session
    feedback, evaluation suite.

    Scoring rules:
        * gateway replacement: "partial" with at least 4 of its 6 labels.
        * config migration: "partial" with at least 3 of its 4 labels.
        * canary rollout: always "gap" with no evidence.
        * session feedback: "partial" only when both labels are present.
        * evaluation suite: "partial" when ``benchmark_suite`` is present.

    Each summary switches on whether *any* evidence label was found, not on
    the status, so a "gap" row can still carry the "already has" wording.
    """
    seen = {tp.label for tp in touchpoints}

    def present(candidates: tuple[str, ...]) -> tuple[str, ...]:
        # Preserve candidate order so the evidence column is deterministic.
        return tuple(name for name in candidates if name in seen)

    # --- Gateway replacement -------------------------------------------
    gateway_evidence = present(
        (
            "fallback_chain",
            "runtime_provider",
            "gateway_provider_routing",
            "cron_runtime_provider",
            "auxiliary_fallback_chain",
            "delegate_runtime_provider",
        )
    )
    if gateway_evidence:
        gateway_note = (
            "Hermes already spreads provider routing across core agent, runtime provider, gateway, cron, auxiliary, and delegation seams; "
            "TensorZero would need parity across all of them before it can replace the gateway layer."
        )
    else:
        gateway_note = "No grounded routing surfaces were found for a gateway replacement assessment."

    # --- Config migration ----------------------------------------------
    config_evidence = present(
        (
            "provider_routing_config",
            "runtime_provider",
            "smart_model_routing",
            "fallback_chain",
        )
    )
    if config_evidence:
        config_note = (
            "Hermes has multiple config concepts to migrate (`provider_routing`, `fallback_providers`, `smart_model_routing`, runtime provider resolution), "
            "so TensorZero is not a drop-in config swap."
        )
    else:
        config_note = "No current config migration surface was found."

    # --- Canary rollout (no rule scans for it, hence never any evidence) -
    canary_evidence: tuple[str, ...] = ()
    canary_note = (
        "The repo shows semantic routing and fallback, but no grounded 10% traffic-split canary mechanism. "
        "A TensorZero cutover would need new percentage-based rollout controls and observability hooks."
    )

    # --- Session feedback ----------------------------------------------
    session_candidates = ("session_db", "trajectory_export")
    session_evidence = present(session_candidates)
    if session_evidence:
        session_note = (
            "Hermes already has SessionDB and trajectory export surfaces that can feed offline optimization data, "
            "but not a TensorZero-native ingestion path yet."
        )
    else:
        session_note = "No session-data surface was found for prompt optimization."

    # --- Evaluation suite ----------------------------------------------
    eval_evidence = present(("benchmark_suite", "trajectory_export"))
    if eval_evidence:
        eval_note = (
            "Hermes already has benchmark/trajectory machinery that can seed TensorZero A/B evaluation, "
            "but no integrated TensorZero experiment runner or live evaluation gateway."
        )
    else:
        eval_note = "No evaluation harness was found to support TensorZero A/B testing."

    return [
        RequirementStatus(
            key="gateway_replacement",
            name="Gateway replacement scope",
            status="partial" if len(gateway_evidence) >= 4 else "gap",
            evidence_labels=gateway_evidence,
            summary=gateway_note,
        ),
        RequirementStatus(
            key="config_migration",
            name="Config migration",
            status="partial" if len(config_evidence) >= 3 else "gap",
            evidence_labels=config_evidence,
            summary=config_note,
        ),
        RequirementStatus(
            key="canary_rollout",
            name="10% traffic canary",
            status="gap",
            evidence_labels=canary_evidence,
            summary=canary_note,
        ),
        RequirementStatus(
            key="session_feedback",
            name="Session data for prompt optimization",
            status="partial" if len(session_evidence) == len(session_candidates) else "gap",
            evidence_labels=session_evidence,
            summary=session_note,
        ),
        RequirementStatus(
            key="evaluation_suite",
            name="Evaluation suite / A/B testing",
            status="partial" if "benchmark_suite" in eval_evidence else "gap",
            evidence_labels=eval_evidence,
            summary=eval_note,
        ),
    ]
|
|
|
|
|
|
def build_report(touchpoints: list[Touchpoint], requirement_matrix: list[RequirementStatus]) -> EvaluationReport:
    """Bundle scan results and scored requirements into an EvaluationReport.

    The issue identity comes from the module constants and the
    recommendation text is fixed; it does not vary with the inputs. Both
    input lists are frozen into tuples so the report is immutable.
    """
    return EvaluationReport(
        issue_number=ISSUE_NUMBER,
        issue_title=ISSUE_TITLE,
        issue_url=ISSUE_URL,
        recommendation=(
            "Not ready for direct replacement. Recommend a shadow-evaluation phase first: keep Hermes routing live, "
            "inventory the migration seams, export SessionDB/trajectory data into an offline TensorZero experiment loop, "
            "and only design a canary gateway once percentage-based rollout controls exist."
        ),
        touchpoints=tuple(touchpoints),
        requirements=tuple(requirement_matrix),
    )
|
|
|
|
|
|
def build_markdown(report: EvaluationReport) -> str:
    """Render *report* as the markdown evaluation packet.

    The document has fixed sections (scope, issue requirements,
    recommendation, requirement matrix, grounded touchpoints, suggested
    next slice). Only the header line, the recommendation, the matrix rows
    and the touchpoint bullets depend on *report*.

    Returns:
        The markdown text, ending in exactly one newline.
    """

    def evidence_cell(labels: tuple[str, ...]) -> str:
        # An em dash marks "no evidence" so the table cell is never blank.
        return ", ".join(labels) if labels else "—"

    preamble = [
        "# TensorZero Evaluation Packet",
        "",
        f"Issue #{report.issue_number}: [{report.issue_title}]({report.issue_url})",
        "",
        "## Scope",
        "",
        "This packet evaluates TensorZero as a possible replacement for Hermes' custom provider-routing stack.",
        "It is intentionally grounded in the current repo state rather than a speculative cutover plan.",
        "",
        "## Issue requirements being evaluated",
        "",
        "- Deploy tensorzero gateway (Rust binary)",
        "- Migrate provider routing config",
        "- Test with canary (10% traffic) before full cutover",
        "- Feed session data for prompt optimization",
        "- Evaluation suite for A/B testing models",
        "",
        "## Recommendation",
        "",
        report.recommendation,
        "",
        "## Requirement matrix",
        "",
        "| Requirement | Status | Evidence labels | Summary |",
        "| --- | --- | --- | --- |",
    ]

    matrix_rows = [
        f"| {req.name} | {req.status} | {evidence_cell(req.evidence_labels)} | {req.summary} |"
        for req in report.requirements
    ]

    if report.touchpoints:
        touchpoint_rows = [
            f"- `{hit.file_path}:{hit.line_number}` — [{hit.label}] {hit.matched_text}"
            for hit in report.touchpoints
        ]
    else:
        touchpoint_rows = ["- No routing/evaluation touchpoints were found."]

    closing = [
        "## Suggested next slice",
        "",
        "1. Build an exporter that emits SessionDB + trajectory data into a TensorZero-friendly offline dataset.",
        "2. Define percentage-based canary controls before attempting any gateway replacement.",
        "3. Keep Hermes routing authoritative until TensorZero proves parity across CLI, gateway, cron, auxiliary, and delegation surfaces.",
        "",
    ]

    document = [
        *preamble,
        *matrix_rows,
        "",
        "## Grounded Hermes touchpoints",
        "",
        *touchpoint_rows,
        "",
        *closing,
    ]
    # Normalise trailing whitespace down to a single final newline.
    return "\n".join(document).rstrip() + "\n"
|
|
|
|
|
|
def write_outputs(report: EvaluationReport, markdown_path: Path, json_path: Path | None = None) -> None:
    """Write the markdown packet and, optionally, a JSON dump of *report*.

    Parent directories are created as needed. The markdown file is always
    written; the JSON file (``asdict(report)`` with 2-space indentation) is
    written only when *json_path* is given. Both files are UTF-8.
    """
    markdown_dir = markdown_path.parent
    markdown_dir.mkdir(parents=True, exist_ok=True)
    rendered = build_markdown(report)
    markdown_path.write_text(rendered, encoding="utf-8")

    if json_path is None:
        return

    json_path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(asdict(report), indent=2)
    json_path.write_text(payload, encoding="utf-8")
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse the command line of the packet generator.

    Options (all optional, all strings):
        --repo-root    repo to scan; defaults to the current directory.
        --output       markdown destination; defaults to DEFAULT_OUTPUT.
        --json-output  JSON destination; defaults to DEFAULT_JSON_OUTPUT.
    """
    cli = argparse.ArgumentParser(description="Generate a grounded TensorZero evaluation packet for Hermes")
    option_specs = (
        ("--repo-root", ".", "Hermes repo root to scan"),
        ("--output", str(DEFAULT_OUTPUT), "Markdown output path"),
        ("--json-output", str(DEFAULT_JSON_OUTPUT), "Optional JSON output path"),
    )
    for flag, default_value, help_text in option_specs:
        cli.add_argument(flag, default=default_value, help=help_text)
    return cli.parse_args()
|
|
|
|
|
|
def main() -> int:
    """Scan the repo, score the requirements, and write the packet files.

    An empty ``--json-output`` value disables the JSON file. Each written
    path is echoed to stdout.

    Returns:
        Always 0.
    """
    options = parse_args()
    root = Path(options.repo_root).resolve()

    found = scan_touchpoints(root)
    report = build_report(found, build_requirement_matrix(found))

    json_target = None
    if options.json_output:
        json_target = Path(options.json_output)

    write_outputs(report, Path(options.output), json_target)

    print(f"Wrote {options.output}")
    if json_target is not None:
        print(f"Wrote {json_target}")
    return 0
|
|
|
|
|
|
# Script entry point: run main() and exit the process with its return code.
if __name__ == "__main__":
    raise SystemExit(main())
|