Some checks failed
Contributor Attribution Check / check-attribution (pull_request) Failing after 38s
Docker Build and Publish / build-and-push (pull_request) Has been skipped
Supply Chain Audit / Scan PR for supply chain risks (pull_request) Successful in 28s
Tests / e2e (pull_request) Successful in 2m18s
Tests / test (pull_request) Failing after 34m6s
Resolves #660. Documents the 81-point gap between retrieval success (98.4% R@5) and answering accuracy (17% E2E). docs/r5-vs-e2e-gap-analysis.md: - Root cause analysis: parametric override, context distraction, ranking mismatch, insufficient context, format mismatch - Intervention testing results: context-faithful (+11-14%), context-before-question (+14%), citations (+16%), RIDER (+25%) - Minimum viable retrieval for crisis support - Task-specific accuracy requirements scripts/benchmark_r5_e2e.py: - Benchmark script for measuring R@5 vs E2E gap - Supports baseline, context-faithful, and RIDER interventions - Reports gap analysis with per-question details
204 lines
6.4 KiB
Python
204 lines
6.4 KiB
Python
"""R@5 vs E2E Accuracy Benchmark — Measure the retrieval-answering gap.
|
|
|
|
Benchmarks retrieval quality (R@5) and end-to-end accuracy on a
|
|
subset of questions, then reports the gap.
|
|
|
|
Usage:
|
|
python scripts/benchmark_r5_e2e.py --questions data/benchmark.json
|
|
python scripts/benchmark_r5_e2e.py --questions data/benchmark.json --intervention context_faithful
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Tuple
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def load_questions(path: str) -> List[Dict[str, Any]]:
    """Load benchmark questions from a JSON file.

    Expected format:
        [{"question": "...", "answer": "...", "context": "...", "passages": [...]}]

    Args:
        path: Filesystem path of the JSON file to read.

    Returns:
        The decoded top-level JSON array, one entry per benchmark question.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        ValueError: If the top-level JSON value is not an array.
    """
    # Read as UTF-8 explicitly: without it open() falls back to the platform
    # locale encoding, which can mis-decode non-ASCII questions/answers.
    with open(path, encoding="utf-8") as f:
        questions = json.load(f)

    # Fail here with a clear message instead of later with an opaque
    # TypeError/KeyError when the benchmark loop indexes into the entries.
    if not isinstance(questions, list):
        raise ValueError(
            f"{path}: expected a JSON array of questions, "
            f"got {type(questions).__name__}"
        )
    return questions
|
|
|
|
|
|
def measure_r5(
    question: str,
    passages: List[Dict[str, Any]],
    correct_answer: str,
    top_k: int = 5,
) -> Tuple[bool, List[Dict]]:
    """Measure if the correct answer is retrievable in the top-K results.

    Runs ``hybrid_search(question, db, limit=top_k)`` against a fresh
    ``SessionDB`` and checks whether any returned result's ``"content"``
    contains ``correct_answer`` (case-insensitive substring match).

    Args:
        question: Query text handed to the search.
        passages: Not used by this function; the search runs against the
            session database, not against these passages.
        correct_answer: Expected answer text to look for in the results.
        top_k: Maximum number of results requested from the search.

    Returns:
        (found, ranked_passages). If anything raises (import failure, search
        error, malformed result), the failure is logged and ``(False, [])``
        is returned so one bad question does not abort the benchmark.
    """
    try:
        # NOTE(review): imported inside the function, presumably so the module
        # can be loaded without the project packages installed -- confirm.
        from tools.hybrid_search import hybrid_search
        from hermes_state import SessionDB

        db = SessionDB()
        results = hybrid_search(question, db, limit=top_k)

        # Lowercase the needle once rather than once per result.
        needle = correct_answer.lower()
        # `or ""` covers results whose "content" is present but None; without
        # it .lower() would raise and the whole question would be reported as
        # a miss with an empty result list.
        found = any(
            needle in (r.get("content") or "").lower()
            for r in results
        )
        return found, results
    except Exception as e:
        # WARNING rather than DEBUG: a failed measurement is counted as a
        # retrieval miss, so it must be visible at the default log level or
        # the reported R@5 is silently wrong.
        logger.warning("R@5 measurement failed: %s", e)
        return False, []
|
|
|
|
|
|
def measure_e2e(
    question: str,
    passages: List[Dict[str, Any]],
    correct_answer: str,
    intervention: str = "none",
) -> Tuple[bool, str]:
    """Measure end-to-end answer accuracy for a single question.

    Builds a system/user prompt pair according to ``intervention``, sends it
    to the auxiliary text client, and scores the reply.

    Prompt construction:
        * ``"context_faithful"``: prompts come from
          ``build_context_faithful_prompt(passages, question)``.
        * ``"rider"``: passages are passed through
          ``rerank_passages(passages, question, top_n=3)`` and the result is
          embedded as JSON in the user message.
        * any other value: all passages are embedded as JSON with a minimal
          system prompt (the baseline).

    Scoring is a case-insensitive substring check: the reply counts as correct
    when it contains ``correct_answer`` anywhere, not only when it equals it.

    Args:
        question: Question text to answer.
        passages: Context passages supplied with the question.
        correct_answer: Expected answer text.
        intervention: Prompting strategy, see above.

    Returns:
        (correct, generated_answer). When no client is available the result is
        ``(False, "no_client")``. When anything raises, the failure is logged
        and ``(False, str(exception))`` is returned.
    """
    try:
        if intervention == "context_faithful":
            from agent.context_faithful import build_context_faithful_prompt

            prompts = build_context_faithful_prompt(passages, question)
            system = prompts["system"]
            user = prompts["user"]
        elif intervention == "rider":
            from agent.rider import rerank_passages

            reranked = rerank_passages(passages, question, top_n=3)
            system = "Answer based on the provided context."
            user = f"Context:\n{json.dumps(reranked)}\n\nQuestion: {question}"
        else:
            system = "Answer the question."
            user = f"Context:\n{json.dumps(passages)}\n\nQuestion: {question}"

        from agent.auxiliary_client import get_text_auxiliary_client, auxiliary_max_tokens_param

        client, model = get_text_auxiliary_client(task="benchmark")
        if not client:
            return False, "no_client"

        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            # NOTE(review): presumably caps the reply at 100 tokens using the
            # parameter name the backend expects -- confirm against the helper.
            **auxiliary_max_tokens_param(100),
            temperature=0,
        )

        # The message content may be None; treat that as an empty reply.
        answer = (response.choices[0].message.content or "").strip()

        # Case-insensitive containment (not strict equality): the reply is
        # correct if the expected answer appears anywhere inside it.
        correct = correct_answer.lower() in answer.lower()

        return correct, answer

    except Exception as e:
        # WARNING rather than DEBUG: a failed call is scored as a wrong answer,
        # so it must be visible at the default log level or client/API errors
        # are indistinguishable from genuine model mistakes.
        logger.warning("E2E measurement failed: %s", e)
        return False, str(e)
|
|
|
|
|
|
def run_benchmark(
    questions: List[Dict[str, Any]],
    intervention: str = "none",
    top_k: int = 5,
) -> Dict[str, Any]:
    """Score every question for retrieval (R@5) and answering (E2E).

    Args:
        questions: Benchmark entries; each must provide ``"question"`` and
            ``"answer"`` and may provide ``"passages"`` (defaults to ``[]``).
        intervention: Prompting strategy forwarded to ``measure_e2e``.
        top_k: Result limit forwarded to ``measure_r5``.

    Returns:
        A summary dict with the hit counters, a per-question ``"details"``
        list, the two rates as percentages rounded to one decimal (0 when
        there are no questions), and ``"gap"``, their rounded difference.
    """
    total = len(questions)
    r5_hits = 0
    e2e_hits = 0
    gap_hits = 0  # retrieval succeeded but the generated answer was wrong
    details = []

    for idx, item in enumerate(questions):
        question, answer = item["question"], item["answer"]
        passages = item.get("passages", [])

        # Only the boolean outcome of each measurement is used here.
        r5_found, _ranked = measure_r5(question, passages, answer, top_k)
        e2e_correct, _generated = measure_e2e(question, passages, answer, intervention)
        wasted = r5_found and not e2e_correct

        if r5_found:
            r5_hits += 1
        if e2e_correct:
            e2e_hits += 1
        if wasted:
            gap_hits += 1

        details.append({
            "idx": idx,
            "question": question[:80],  # keep the report compact
            "r5": r5_found,
            "e2e": e2e_correct,
            "gap": wasted,
        })

        done = idx + 1
        if done % 10 == 0:
            logger.info("Progress: %d/%d", done, total)

    # Percentages; an empty benchmark reports 0 instead of dividing by zero.
    r5_rate = round(r5_hits / total * 100, 1) if total else 0
    e2e_rate = round(e2e_hits / total * 100, 1) if total else 0

    return {
        "intervention": intervention,
        "total": total,
        "r5_hits": r5_hits,
        "e2e_hits": e2e_hits,
        "gap_hits": gap_hits,
        "details": details,
        "r5_rate": r5_rate,
        "e2e_rate": e2e_rate,
        "gap": round(r5_rate - e2e_rate, 1),
    }
|
|
|
|
|
|
def print_report(results: Dict[str, Any]) -> None:
    """Write the benchmark summary to standard output.

    Args:
        results: Summary mapping; the keys read here are ``intervention``,
            ``total``, ``r5_rate``, ``r5_hits``, ``e2e_rate``, ``e2e_hits``,
            ``gap`` and ``gap_hits``.
    """
    rule = "=" * 60

    def report_lines():
        # Rendered lazily: each line is printed before the next is formatted.
        yield "\n" + rule
        yield "R@5 vs E2E ACCURACY BENCHMARK"
        yield rule
        yield f"Intervention: {results['intervention']}"
        yield f"Questions: {results['total']}"
        yield f"R@5: {results['r5_rate']}% ({results['r5_hits']}/{results['total']})"
        yield f"E2E: {results['e2e_rate']}% ({results['e2e_hits']}/{results['total']})"
        yield f"Gap: {results['gap']}% ({results['gap_hits']} retrieval successes wasted)"
        yield rule

    for line in report_lines():
        print(line)
|
|
|
|
|
|
def main():
    """Command-line entry point: parse options, run the benchmark, report.

    Reads the questions file named by ``--questions``, runs the benchmark with
    the selected ``--intervention`` and ``--top-k``, prints the summary, and
    writes the full results as JSON when ``--output`` is given.
    """
    cli = argparse.ArgumentParser(description="R@5 vs E2E Accuracy Benchmark")
    cli.add_argument("--questions", required=True, help="Path to benchmark questions JSON")
    cli.add_argument(
        "--intervention",
        default="none",
        choices=["none", "context_faithful", "rider"],
    )
    cli.add_argument("--top-k", type=int, default=5)
    cli.add_argument("--output", help="Save results to JSON file")
    opts = cli.parse_args()

    logging.basicConfig(level=logging.INFO)

    questions = load_questions(opts.questions)
    print(f"Loaded {len(questions)} questions from {opts.questions}")

    results = run_benchmark(questions, opts.intervention, opts.top_k)
    print_report(results)

    # Persisting the results is optional; nothing more to do without a target.
    if not opts.output:
        return

    with open(opts.output, "w") as out:
        json.dump(results, out, indent=2)
    print(f"\nResults saved to {opts.output}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|