hermes-agent/scripts/tensorzero_eval_packet.py

#!/usr/bin/env python3
"""Generate a grounded TensorZero evaluation packet for Hermes.

This script inventories the current Hermes routing/evaluation surfaces, then
builds a markdown packet assessing how much of issue #860 can be satisfied by
TensorZero and where the migration risk still lives.
"""

from __future__ import annotations

import argparse
import json
import re
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Iterable

# Identity of the tracking issue this packet evaluates; build_report() copies
# these three values into every EvaluationReport.
ISSUE_NUMBER = 860
ISSUE_TITLE = "tensorzero LLMOps platform evaluation"
ISSUE_URL = "https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/860"
# Default output locations, used as the argparse defaults in parse_args().
# They are relative paths; main() does not join them onto --repo-root.
DEFAULT_OUTPUT = Path("docs/evaluations/tensorzero-860-evaluation.md")
DEFAULT_JSON_OUTPUT = Path("docs/evaluations/tensorzero-860-evaluation.json")


@dataclass(frozen=True)
class TouchpointPattern:
    """Describe one file to scan and the regex that marks a touchpoint in it."""

    # Tag copied onto every Touchpoint produced from this pattern.
    label: str
    # Path of the file to scan; scan_touchpoints() joins it onto the repo root.
    file_path: str
    # Regex source; _iter_matches() compiles it with re.IGNORECASE and searches
    # each line of the file separately.
    regex: str
    # Human-readable explanation of the surface; no code in this file reads it.
    description: str


@dataclass(frozen=True)
class Touchpoint:
    """Record one source line that matched a TouchpointPattern."""

    # Label of the pattern that produced this match.
    label: str
    # File path of the pattern that produced this match (as written in the pattern).
    file_path: str
    # 1-based line number of the matching line within the file.
    line_number: int
    # The whole matching line with surrounding whitespace stripped, not just
    # the substring the regex matched.
    matched_text: str


@dataclass(frozen=True)
class RequirementStatus:
    """Hold one row of the requirement matrix rendered in the packet."""

    # Machine-friendly identifier for the requirement (e.g. "canary_rollout").
    key: str
    # Display name; build_markdown() prints it in the "Requirement" column.
    name: str
    # Assessment value; build_requirement_matrix() only emits "partial" or "gap".
    status: str
    # Touchpoint labels that were found and count as evidence for this row.
    evidence_labels: tuple[str, ...]
    # One-paragraph prose explanation shown in the "Summary" column.
    summary: str


@dataclass(frozen=True)
class EvaluationReport:
    """Bundle everything the packet needs: issue identity, verdict, and evidence.

    write_outputs() renders an instance to markdown via build_markdown() and
    serializes it to JSON via dataclasses.asdict().
    """

    # Issue metadata shown in the packet header.
    issue_number: int
    issue_title: str
    issue_url: str
    # Overall recommendation paragraph printed under "## Recommendation".
    recommendation: str
    # Every matched source line, in the order they were supplied to build_report().
    touchpoints: tuple[Touchpoint, ...]
    # Requirement matrix rows, in display order.
    requirements: tuple[RequirementStatus, ...]


# Inventory of the Hermes files to scan and the regex that marks a relevant
# routing/evaluation surface in each one. scan_touchpoints() walks this tuple
# in order, so the order here is the order touchpoints appear in the report.
# The labels are the vocabulary build_requirement_matrix() looks for when it
# decides each requirement's status.
PATTERNS: tuple[TouchpointPattern, ...] = (
    TouchpointPattern(
        label="fallback_chain",
        file_path="run_agent.py",
        regex=r"_fallback_chain|fallback_providers|fallback_model",
        description="Primary agent fallback-provider chain in the core conversation loop.",
    ),
    TouchpointPattern(
        label="provider_routing_config",
        file_path="cli.py",
        regex=r"provider_routing|fallback_providers|smart_model_routing",
        description="CLI-owned provider routing and fallback configuration surfaces.",
    ),
    TouchpointPattern(
        label="runtime_provider",
        file_path="hermes_cli/runtime_provider.py",
        regex=r"def resolve_runtime_provider|def resolve_requested_provider",
        description="Central runtime provider resolution for CLI, gateway, cron, and helpers.",
    ),
    TouchpointPattern(
        label="smart_model_routing",
        file_path="agent/smart_model_routing.py",
        regex=r"def resolve_turn_route|def choose_cheap_model_route",
        description="Cheap-vs-strong turn routing that TensorZero would need to absorb or replace.",
    ),
    TouchpointPattern(
        label="gateway_provider_routing",
        file_path="gateway/run.py",
        regex=r"def _load_provider_routing|def _load_fallback_model|def _load_smart_model_routing",
        description="Gateway-specific loading of routing, fallback, and smart-model policies.",
    ),
    TouchpointPattern(
        label="cron_runtime_provider",
        file_path="cron/scheduler.py",
        regex=r"resolve_runtime_provider|resolve_turn_route|provider_routing|fallback_model",
        description="Cron execution path that re-resolves providers and routing on every run.",
    ),
    TouchpointPattern(
        label="auxiliary_fallback_chain",
        file_path="agent/auxiliary_client.py",
        regex=r"fallback chain|_get_provider_chain|provider chain",
        description="Auxiliary task routing/fallback chain outside the main inference path.",
    ),
    TouchpointPattern(
        label="delegate_runtime_provider",
        file_path="tools/delegate_tool.py",
        regex=r"runtime provider system|resolve the full credential bundle|resolve_runtime_provider",
        description="Subagent/delegation routing path that would also need TensorZero parity.",
    ),
    TouchpointPattern(
        label="session_db",
        file_path="hermes_state.py",
        regex=r"class SessionDB",
        description="Session persistence surface that could feed TensorZero optimization/eval data.",
    ),
    TouchpointPattern(
        label="trajectory_export",
        file_path="batch_runner.py",
        regex=r"trajectory_entry|save_trajectories|_convert_to_trajectory_format",
        description="Trajectory export surface for offline optimization and replay data.",
    ),
    TouchpointPattern(
        label="benchmark_suite",
        file_path="benchmarks/tool_call_benchmark.py",
        regex=r"ToolCall\(|class ToolCall|benchmark",
        description="Existing benchmark/evaluation harness that could map to TensorZero experiments.",
    ),
)


def _iter_matches(pattern: TouchpointPattern, text: str) -> Iterable[Touchpoint]:
    """Yield a Touchpoint for every line of *text* that matches *pattern*.

    The pattern's regex is applied case-insensitively to each line on its
    own, so a match can never span lines and a line yields at most one
    Touchpoint regardless of how many times the regex hits it. Line numbers
    are 1-based and the recorded text is the whole line, whitespace-stripped.
    """
    line_matches = re.compile(pattern.regex, re.IGNORECASE).search
    for number, raw_line in enumerate(text.splitlines(), start=1):
        if line_matches(raw_line) is None:
            continue
        yield Touchpoint(
            label=pattern.label,
            file_path=pattern.file_path,
            line_number=number,
            matched_text=raw_line.strip(),
        )


def scan_touchpoints(repo_root: Path) -> list[Touchpoint]:
    """Scan every file named in PATTERNS under *repo_root* for touchpoints.

    Args:
        repo_root: Directory that each pattern's ``file_path`` is joined onto.

    Returns:
        All matches, grouped by pattern in PATTERNS order and, within one
        pattern, in ascending line order. Patterns whose target is not a
        regular file contribute nothing.
    """
    touchpoints: list[Touchpoint] = []
    for pattern in PATTERNS:
        path = repo_root / pattern.file_path
        # is_file() rather than exists(): a directory that happens to sit at
        # the expected path must be skipped like a missing file, not handed
        # to read_text() where it would raise IsADirectoryError.
        if not path.is_file():
            continue
        # errors="replace" keeps the inventory best-effort: one stray
        # non-UTF-8 byte in a scanned file should not abort the whole packet.
        text = path.read_text(encoding="utf-8", errors="replace")
        touchpoints.extend(_iter_matches(pattern, text))
    return touchpoints


def build_requirement_matrix(touchpoints: list[Touchpoint]) -> list[RequirementStatus]:
    """Derive the five-row requirement matrix from the touchpoints found.

    Only the *labels* present in ``touchpoints`` matter; line numbers and text
    are ignored. Rows are returned in a fixed order: gateway replacement,
    config migration, canary rollout, session feedback, evaluation suite.

    Each row's status is "partial" or "gap". The canary row is always "gap"
    with no evidence. Note that status and summary are decided separately:
    status uses a per-row threshold, while the optimistic summary text is
    chosen as soon as *any* evidence label for that row is present.
    """
    found = {tp.label for tp in touchpoints}

    def present(candidates: tuple[str, ...]) -> tuple[str, ...]:
        # Preserve candidate order so evidence labels are listed consistently.
        return tuple(name for name in candidates if name in found)

    # Gateway replacement: "partial" needs at least four of the six routing seams.
    gateway_hits = present(
        (
            "fallback_chain",
            "runtime_provider",
            "gateway_provider_routing",
            "cron_runtime_provider",
            "auxiliary_fallback_chain",
            "delegate_runtime_provider",
        )
    )
    if gateway_hits:
        gateway_summary = (
            "Hermes already spreads provider routing across core agent, runtime provider, gateway, cron, auxiliary, and delegation seams; "
            "TensorZero would need parity across all of them before it can replace the gateway layer."
        )
    else:
        gateway_summary = "No grounded routing surfaces were found for a gateway replacement assessment."
    gateway_row = RequirementStatus(
        key="gateway_replacement",
        name="Gateway replacement scope",
        status="partial" if len(gateway_hits) >= 4 else "gap",
        evidence_labels=gateway_hits,
        summary=gateway_summary,
    )

    # Config migration: "partial" needs at least three of the four config surfaces.
    config_hits = present(
        (
            "provider_routing_config",
            "runtime_provider",
            "smart_model_routing",
            "fallback_chain",
        )
    )
    if config_hits:
        config_summary = (
            "Hermes has multiple config concepts to migrate (`provider_routing`, `fallback_providers`, `smart_model_routing`, runtime provider resolution), "
            "so TensorZero is not a drop-in config swap."
        )
    else:
        config_summary = "No current config migration surface was found."
    config_row = RequirementStatus(
        key="config_migration",
        name="Config migration",
        status="partial" if len(config_hits) >= 3 else "gap",
        evidence_labels=config_hits,
        summary=config_summary,
    )

    # Canary rollout: unconditional gap, no pattern provides evidence for it.
    canary_row = RequirementStatus(
        key="canary_rollout",
        name="10% traffic canary",
        status="gap",
        evidence_labels=(),
        summary=(
            "The repo shows semantic routing and fallback, but no grounded 10% traffic-split canary mechanism. "
            "A TensorZero cutover would need new percentage-based rollout controls and observability hooks."
        ),
    )

    # Session feedback: "partial" only when both data surfaces are present.
    session_candidates = ("session_db", "trajectory_export")
    session_hits = present(session_candidates)
    if session_hits:
        session_summary = (
            "Hermes already has SessionDB and trajectory export surfaces that can feed offline optimization data, "
            "but not a TensorZero-native ingestion path yet."
        )
    else:
        session_summary = "No session-data surface was found for prompt optimization."
    session_row = RequirementStatus(
        key="session_feedback",
        name="Session data for prompt optimization",
        status="partial" if all(name in found for name in session_candidates) else "gap",
        evidence_labels=session_hits,
        summary=session_summary,
    )

    # Evaluation suite: status hinges on the benchmark harness alone.
    eval_hits = present(("benchmark_suite", "trajectory_export"))
    if eval_hits:
        eval_summary = (
            "Hermes already has benchmark/trajectory machinery that can seed TensorZero A/B evaluation, "
            "but no integrated TensorZero experiment runner or live evaluation gateway."
        )
    else:
        eval_summary = "No evaluation harness was found to support TensorZero A/B testing."
    eval_row = RequirementStatus(
        key="evaluation_suite",
        name="Evaluation suite / A/B testing",
        status="partial" if "benchmark_suite" in found else "gap",
        evidence_labels=eval_hits,
        summary=eval_summary,
    )

    return [gateway_row, config_row, canary_row, session_row, eval_row]


def build_report(touchpoints: list[Touchpoint], requirement_matrix: list[RequirementStatus]) -> EvaluationReport:
    """Assemble the immutable EvaluationReport for the tracked issue.

    The issue identity comes from the module-level ISSUE_* constants and the
    recommendation is a fixed paragraph; it does not vary with the inputs.
    Both input lists are frozen into tuples, preserving their order.
    """
    frozen_touchpoints = tuple(touchpoints)
    frozen_requirements = tuple(requirement_matrix)
    return EvaluationReport(
        issue_number=ISSUE_NUMBER,
        issue_title=ISSUE_TITLE,
        issue_url=ISSUE_URL,
        recommendation=(
            "Not ready for direct replacement. Recommend a shadow-evaluation phase first: keep Hermes routing live, "
            "inventory the migration seams, export SessionDB/trajectory data into an offline TensorZero experiment loop, "
            "and only design a canary gateway once percentage-based rollout controls exist."
        ),
        touchpoints=frozen_touchpoints,
        requirements=frozen_requirements,
    )


def build_markdown(report: EvaluationReport) -> str:
    """Render *report* as the markdown evaluation packet.

    The document has fixed sections (scope, issue requirements,
    recommendation, requirement matrix, touchpoints, suggested next slice).
    Only the issue header, the recommendation paragraph, the matrix rows, and
    the touchpoint bullets come from the report. A requirement with no
    evidence labels shows an em dash in that column, and an empty touchpoint
    list is replaced by a single placeholder bullet.

    Returns:
        The markdown text, ending in exactly one newline.
    """
    preamble = [
        "# TensorZero Evaluation Packet",
        "",
        f"Issue #{report.issue_number}: [{report.issue_title}]({report.issue_url})",
        "",
        "## Scope",
        "",
        "This packet evaluates TensorZero as a possible replacement for Hermes' custom provider-routing stack.",
        "It is intentionally grounded in the current repo state rather than a speculative cutover plan.",
        "",
        "## Issue requirements being evaluated",
        "",
        "- Deploy tensorzero gateway (Rust binary)",
        "- Migrate provider routing config",
        "- Test with canary (10% traffic) before full cutover",
        "- Feed session data for prompt optimization",
        "- Evaluation suite for A/B testing models",
        "",
        "## Recommendation",
        "",
        report.recommendation,
        "",
        "## Requirement matrix",
        "",
        "| Requirement | Status | Evidence labels | Summary |",
        "| --- | --- | --- | --- |",
    ]

    matrix_rows = []
    for requirement in report.requirements:
        if requirement.evidence_labels:
            evidence_cell = ", ".join(requirement.evidence_labels)
        else:
            evidence_cell = "—"
        matrix_rows.append(
            f"| {requirement.name} | {requirement.status} | {evidence_cell} | {requirement.summary} |"
        )

    if report.touchpoints:
        bullets = [
            f"- `{hit.file_path}:{hit.line_number}` — [{hit.label}] {hit.matched_text}"
            for hit in report.touchpoints
        ]
    else:
        bullets = ["- No routing/evaluation touchpoints were found."]

    closing = [
        "",
        "## Suggested next slice",
        "",
        "1. Build an exporter that emits SessionDB + trajectory data into a TensorZero-friendly offline dataset.",
        "2. Define percentage-based canary controls before attempting any gateway replacement.",
        "3. Keep Hermes routing authoritative until TensorZero proves parity across CLI, gateway, cron, auxiliary, and delegation surfaces.",
        "",
    ]

    document = [
        *preamble,
        *matrix_rows,
        "",
        "## Grounded Hermes touchpoints",
        "",
        *bullets,
        *closing,
    ]
    # Normalise the tail so the file always ends with a single newline.
    return "\n".join(document).rstrip() + "\n"


def write_outputs(report: EvaluationReport, markdown_path: Path, json_path: Path | None = None) -> None:
    """Write the markdown packet and, optionally, a JSON dump of *report*.

    Parent directories are created as needed. The markdown file is always
    written; the JSON file (``dataclasses.asdict`` of the report, indented by
    two spaces) is written only when ``json_path`` is given. Both files are
    UTF-8 encoded.
    """
    markdown_path.parent.mkdir(parents=True, exist_ok=True)
    rendered = build_markdown(report)
    markdown_path.write_text(rendered, encoding="utf-8")

    if json_path is None:
        return

    json_path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(asdict(report), indent=2)
    json_path.write_text(payload, encoding="utf-8")


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options for the packet generator.

    Args:
        argv: Argument strings to parse, excluding the program name. When
            omitted (the default) argparse reads ``sys.argv[1:]``, which is
            the previous behaviour; passing an explicit list lets callers and
            tests drive the parser without touching process-global state.

    Returns:
        A namespace with ``repo_root``, ``output`` and ``json_output``, all
        strings. main() treats an empty ``json_output`` as "skip the JSON file".
    """
    parser = argparse.ArgumentParser(description="Generate a grounded TensorZero evaluation packet for Hermes")
    parser.add_argument("--repo-root", default=".", help="Hermes repo root to scan")
    parser.add_argument("--output", default=str(DEFAULT_OUTPUT), help="Markdown output path")
    parser.add_argument("--json-output", default=str(DEFAULT_JSON_OUTPUT), help="Optional JSON output path")
    return parser.parse_args(argv)


def main() -> int:
    """Run the scan-assess-write pipeline and return the process exit code.

    Resolves the repo root, scans it for touchpoints, derives the requirement
    matrix, and writes the markdown packet. The JSON dump is written as well
    unless ``--json-output`` is an empty string. Each written path is echoed
    to stdout. Always returns 0.
    """
    options = parse_args()

    found = scan_touchpoints(Path(options.repo_root).resolve())
    report = build_report(found, build_requirement_matrix(found))

    # An empty --json-output value means "markdown only".
    json_target = None
    if options.json_output:
        json_target = Path(options.json_output)

    write_outputs(report, Path(options.output), json_target)

    print(f"Wrote {options.output}")
    if json_target is not None:
        print(f"Wrote {json_target}")
    return 0


# Script entry point: run main() and exit the process with its return code.
if __name__ == "__main__":
    raise SystemExit(main())