# hermes-agent/scripts/benchmark_r5_e2e.py

"""R@5 vs E2E Accuracy Benchmark — Measure the retrieval-answering gap.

Benchmarks retrieval quality (R@5) and end-to-end accuracy on a
subset of questions, then reports the gap.

Usage:
    python scripts/benchmark_r5_e2e.py --questions data/benchmark.json
    python scripts/benchmark_r5_e2e.py --questions data/benchmark.json --intervention context_faithful
"""

from __future__ import annotations

import argparse
import json
import logging
import sys
import time
from pathlib import Path
from typing import Any, Dict, List, Tuple

# Module-level logger named after this module; main() configures the root
# logger at INFO level via logging.basicConfig before the benchmark runs.
logger = logging.getLogger(__name__)


def load_questions(path: str) -> List[Dict[str, Any]]:
    """Load benchmark questions from a JSON file.

    The file is always decoded as UTF-8 so that non-ASCII questions and
    answers load the same way on every platform, independent of the
    locale's default encoding.

    Expected format:
    [{"question": "...", "answer": "...", "context": "...", "passages": [...]}]

    Args:
        path: Filesystem path of the JSON file.

    Returns:
        The decoded top-level JSON list of question records.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError``) or its top-level value is not a list.
    """
    with open(path, encoding="utf-8") as f:
        questions = json.load(f)

    # Fail early with a clear message instead of letting the benchmark loop
    # crash later while indexing into an unexpected structure.
    if not isinstance(questions, list):
        raise ValueError(
            f"Expected a JSON list of questions in {path}, "
            f"got {type(questions).__name__}"
        )
    return questions


def measure_r5(
    question: str,
    passages: List[Dict[str, Any]],
    correct_answer: str,
    top_k: int = 5,
) -> Tuple[bool, List[Dict]]:
    """Measure if correct answer is retrievable in top-K passages.

    Runs ``hybrid_search`` for ``question`` against a fresh ``SessionDB``
    and checks whether any returned result's ``"content"`` contains
    ``correct_answer`` (case-insensitive substring match).

    Args:
        question: Query text passed to the search.
        passages: Currently unused by this function; retrieval is performed
            by ``hybrid_search`` against ``SessionDB``.
        correct_answer: Gold answer to look for in the retrieved content.
        top_k: Maximum number of results requested from the search.

    Returns:
        (found, ranked_passages). A blank gold answer never counts as found.
        If the search backend cannot be imported or anything else fails,
        returns ``(False, [])`` and logs a warning.
    """
    try:
        # Imported lazily so the script can still be loaded when the search
        # backend is not installed; the failure is reported below.
        from tools.hybrid_search import hybrid_search
        from hermes_state import SessionDB

        db = SessionDB()
        results = hybrid_search(question, db, limit=top_k)

        gold = correct_answer.lower()
        if not gold.strip():
            # An empty/whitespace needle is a substring of everything and
            # would be a vacuous hit that inflates R@K.
            return False, results

        # `or ""` tolerates results whose "content" is present but None, so
        # one such result cannot abort the check for the remaining ones.
        found = any(gold in (r.get("content") or "").lower() for r in results)
        return found, results
    except Exception as e:
        # WARNING rather than DEBUG: main() logs at INFO, and a silently
        # swallowed backend failure would be reported as a 0% hit rate.
        logger.warning("R@5 measurement failed: %s", e)
        return False, []


def measure_e2e(
    question: str,
    passages: List[Dict[str, Any]],
    correct_answer: str,
    intervention: str = "none",
) -> Tuple[bool, str]:
    """Measure end-to-end answer accuracy.

    Builds a system/user prompt according to ``intervention``, requests a
    completion from the auxiliary text client (temperature 0, token limit
    from ``auxiliary_max_tokens_param(100)``) and scores the reply by
    case-insensitive substring containment of ``correct_answer``.

    Args:
        question: The question to answer.
        passages: Context passages; JSON-serialised into the prompt or
            handed to the intervention helper.
        correct_answer: Gold answer searched for in the generated text.
        intervention: ``"context_faithful"`` uses
            ``build_context_faithful_prompt``; ``"rider"`` reranks the
            passages with ``rerank_passages(..., top_n=3)``; any other value
            uses the plain baseline prompt.

    Returns:
        (correct, generated_answer). A blank gold answer never counts as
        correct. When no client is available the result is
        ``(False, "no_client")``; on any other failure it is
        ``(False, str(exception))`` and a warning is logged.
    """
    try:
        if intervention == "context_faithful":
            from agent.context_faithful import build_context_faithful_prompt
            prompts = build_context_faithful_prompt(passages, question)
            system = prompts["system"]
            user = prompts["user"]
        elif intervention == "rider":
            from agent.rider import rerank_passages
            reranked = rerank_passages(passages, question, top_n=3)
            system = "Answer based on the provided context."
            user = f"Context:\n{json.dumps(reranked)}\n\nQuestion: {question}"
        else:
            system = "Answer the question."
            user = f"Context:\n{json.dumps(passages)}\n\nQuestion: {question}"

        from agent.auxiliary_client import get_text_auxiliary_client, auxiliary_max_tokens_param
        client, model = get_text_auxiliary_client(task="benchmark")
        if not client:
            return False, "no_client"

        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            **auxiliary_max_tokens_param(100),
            temperature=0,
        )

        # The message content may be None; treat that as an empty answer.
        answer = (response.choices[0].message.content or "").strip()

        # Case-insensitive substring match (not an exact match): the gold
        # answer only has to appear somewhere in the generated text. A blank
        # gold answer would match everything, so it is scored as a miss.
        gold = correct_answer.lower()
        correct = bool(gold.strip()) and gold in answer.lower()

        return correct, answer

    except Exception as e:
        # WARNING rather than DEBUG: main() logs at INFO, and a silently
        # swallowed client failure would be reported as a 0% accuracy.
        logger.warning("E2E measurement failed: %s", e)
        return False, str(e)


def run_benchmark(
    questions: List[Dict[str, Any]],
    intervention: str = "none",
    top_k: int = 5,
) -> Dict[str, Any]:
    """Score every question for retrieval (R@K) and end-to-end accuracy.

    Each record must provide ``"question"`` and ``"answer"``; ``"passages"``
    is optional and defaults to an empty list.

    Returns:
        A dict with the intervention name, the question count, hit counters
        (``r5_hits``, ``e2e_hits``, ``gap_hits``), a per-question ``details``
        list, and the percentages ``r5_rate``, ``e2e_rate`` and ``gap``
        (the difference of the two rounded rates), each rounded to one
        decimal place.
    """
    n_questions = len(questions)
    retrieval_hits = 0
    answer_hits = 0
    wasted_hits = 0  # retrieval succeeded but the generated answer was wrong
    per_question = []

    for done, record in enumerate(questions, start=1):
        text = record["question"]
        gold = record["answer"]
        context = record.get("passages", [])

        # Retrieval first, then generation; only the booleans are kept.
        retrieved, _ = measure_r5(text, context, gold, top_k)
        answered, _ = measure_e2e(text, context, gold, intervention)
        wasted = retrieved and not answered

        if retrieved:
            retrieval_hits += 1
        if answered:
            answer_hits += 1
        if wasted:
            wasted_hits += 1

        per_question.append({
            "idx": done - 1,
            "question": text[:80],
            "r5": retrieved,
            "e2e": answered,
            "gap": wasted,
        })

        if done % 10 == 0:
            logger.info("Progress: %d/%d", done, n_questions)

    def _percent(hits):
        # Guard against division by zero for an empty question set.
        return round(hits / n_questions * 100, 1) if n_questions else 0

    r5_rate = _percent(retrieval_hits)
    e2e_rate = _percent(answer_hits)

    # Key order is kept stable because the dict may be dumped to JSON.
    return {
        "intervention": intervention,
        "total": n_questions,
        "r5_hits": retrieval_hits,
        "e2e_hits": answer_hits,
        "gap_hits": wasted_hits,
        "details": per_question,
        "r5_rate": r5_rate,
        "e2e_rate": e2e_rate,
        "gap": round(r5_rate - e2e_rate, 1),
    }


def print_report(results: Dict[str, Any]) -> None:
    """Write a human-readable summary of a benchmark run to stdout.

    Reads ``intervention``, ``total``, the ``r5``/``e2e`` rates and hit
    counts, ``gap`` and ``gap_hits`` from ``results`` and prints them
    between 60-character ``=`` rules, preceded by a blank line.
    """
    rule = "=" * 60
    report_lines = [
        "\n" + rule,
        "R@5 vs E2E ACCURACY BENCHMARK",
        rule,
        f"Intervention:  {results['intervention']}",
        f"Questions:     {results['total']}",
        f"R@5:           {results['r5_rate']}% ({results['r5_hits']}/{results['total']})",
        f"E2E:           {results['e2e_rate']}% ({results['e2e_hits']}/{results['total']})",
        f"Gap:           {results['gap']}% ({results['gap_hits']} retrieval successes wasted)",
        rule,
    ]
    for line in report_lines:
        print(line)


def main():
    """Command-line entry point.

    Parses the CLI options, loads the question file, runs the benchmark
    with the chosen intervention and top-k, prints the report and, when
    ``--output`` is given, also writes the results to that path as
    indented JSON.
    """
    cli = argparse.ArgumentParser(description="R@5 vs E2E Accuracy Benchmark")
    cli.add_argument("--questions", required=True, help="Path to benchmark questions JSON")
    cli.add_argument("--intervention", default="none", choices=["none", "context_faithful", "rider"])
    cli.add_argument("--top-k", type=int, default=5)
    cli.add_argument("--output", help="Save results to JSON file")
    opts = cli.parse_args()

    logging.basicConfig(level=logging.INFO)

    loaded = load_questions(opts.questions)
    print(f"Loaded {len(loaded)} questions from {opts.questions}")

    report = run_benchmark(loaded, opts.intervention, opts.top_k)
    print_report(report)

    # Persisting the results is optional; nothing more to do without a path.
    if not opts.output:
        return

    with open(opts.output, "w") as sink:
        json.dump(report, sink, indent=2)
    print(f"\nResults saved to {opts.output}")


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()