198 lines
6.1 KiB
Python
198 lines
6.1 KiB
Python
#!/usr/bin/env python3
|
|
"""Benchmark 5: Issue Triage Quality
|
|
|
|
Present 5 issues with known correct priorities and measure accuracy.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import re
|
|
import sys
|
|
import time
|
|
|
|
import requests
|
|
|
|
# Base URL of the local Ollama server the benchmark talks to.
OLLAMA_URL = "http://localhost:11434"

# Prompt sent once per issue.  Doubled braces in the JSON example escape
# str.format placeholders; only {title} and {description} are substituted.
TRIAGE_PROMPT_TEMPLATE = """\
You are a software project triage agent. Assign a priority to the following issue.

Issue: {title}
Description: {description}

Respond ONLY with valid JSON:
{{"priority": "<p0-critical|p1-high|p2-medium|p3-low>", "reason": "<one sentence>"}}
"""
|
|
|
|
# Fixed evaluation set: five issues with a known correct priority each.
# The benchmark scores the model's assigned priority against
# "expected_priority".
ISSUES = [
    {
        "title": "Production database is returning 500 errors on all queries",
        "description": "All users are affected, no transactions are completing, revenue is being lost.",
        "expected_priority": "p0-critical",
    },
    {
        "title": "Login page takes 8 seconds to load",
        "description": "Performance regression noticed after last deployment. Users are complaining but can still log in.",
        "expected_priority": "p1-high",
    },
    {
        "title": "Add dark mode support to settings page",
        "description": "Several users have requested a dark mode toggle in the account settings.",
        "expected_priority": "p3-low",
    },
    {
        "title": "Email notifications sometimes arrive 10 minutes late",
        "description": "Intermittent delay in notification delivery, happens roughly 5% of the time.",
        "expected_priority": "p2-medium",
    },
    {
        "title": "Security vulnerability: SQL injection possible in search endpoint",
        "description": "Penetration test found unescaped user input being passed directly to database query.",
        "expected_priority": "p0-critical",
    },
]

# Canonical priority labels the model is expected to emit.
VALID_PRIORITIES = {"p0-critical", "p1-high", "p2-medium", "p3-low"}

# Map p0 -> 0, p1 -> 1, etc. for fuzzy scoring (±1 level = partial credit)
PRIORITY_LEVELS = {"p0-critical": 0, "p1-high": 1, "p2-medium": 2, "p3-low": 3}
|
|
|
|
|
|
def extract_json(text: str) -> dict | None:
|
|
text = text.strip()
|
|
try:
|
|
return json.loads(text)
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
fence_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
|
|
if fence_match:
|
|
try:
|
|
return json.loads(fence_match.group(1))
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
brace_match = re.search(r"\{[^{}]*\}", text, re.DOTALL)
|
|
if brace_match:
|
|
try:
|
|
return json.loads(brace_match.group(0))
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
return None
|
|
|
|
|
|
def normalize_priority(raw: str) -> str | None:
    """Normalize various priority formats to canonical form.

    Accepts the canonical labels themselves plus the short forms
    "critical"/"p0"/"0", "high"/"p1"/"1", "medium"/"p2"/"2",
    "low"/"p3"/"3".  Returns None for anything unrecognized.
    """
    cleaned = raw.lower().strip()
    canonical = ("p0-critical", "p1-high", "p2-medium", "p3-low")
    if cleaned in canonical:
        return cleaned
    # Derive the accepted aliases from each canonical name: its word
    # part, its pN prefix, and the bare level digit.
    for level, name in enumerate(canonical):
        prefix, _, word = name.partition("-")
        if cleaned in (word, prefix, str(level)):
            return name
    return None
|
|
|
|
|
|
def run_prompt(model: str, prompt: str) -> str:
    """Send one non-streaming generation request to Ollama.

    Returns the model's raw response text.  Raises requests.HTTPError
    on a non-2xx status (via raise_for_status).
    """
    # Low temperature for deterministic triage; cap generation length.
    options = {"temperature": 0.1, "num_predict": 256}
    body = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": options,
    }
    response = requests.post(f"{OLLAMA_URL}/api/generate", json=body, timeout=120)
    response.raise_for_status()
    return response.json()["response"]
|
|
|
|
|
|
def run_benchmark(model: str) -> dict:
    """Run the issue-triage benchmark for a single model.

    Each issue in ISSUES is formatted into TRIAGE_PROMPT_TEMPLATE, sent
    to the model via run_prompt, and the returned JSON priority is
    scored against the known correct one.  A request or parse failure
    on one issue is recorded as a miss (with an "error" field) rather
    than aborting the whole run.

    Returns a summary dict: counts, accuracy, pass/fail at the 80%
    threshold, total wall time, and the per-issue result records.
    """
    results = []
    total_time = 0.0

    for i, issue in enumerate(ISSUES, 1):
        prompt = TRIAGE_PROMPT_TEMPLATE.format(
            title=issue["title"], description=issue["description"]
        )
        # Base record doubles as the failure-path record; the success
        # path overwrites the scoring fields.  This removes the
        # duplicated dict literal the try/except branches used to share.
        record = {
            "issue_id": i,
            "title": issue["title"][:60],
            "expected": issue["expected_priority"],
            "assigned": None,
            "exact_match": False,
            "off_by_one": False,
            "valid_json": False,
        }
        # monotonic() is immune to wall-clock adjustments mid-run.
        start = time.monotonic()
        try:
            raw = run_prompt(model, prompt)
            parsed = extract_json(raw)
            # Only a JSON *object* counts as valid output; a bare list
            # or scalar that happens to parse is not a triage answer.
            record["valid_json"] = isinstance(parsed, dict)
            if record["valid_json"]:
                record["assigned"] = normalize_priority(str(parsed.get("priority", "")))
            assigned = record["assigned"]
            record["exact_match"] = assigned == issue["expected_priority"]
            record["off_by_one"] = (
                assigned is not None
                and not record["exact_match"]
                and abs(
                    PRIORITY_LEVELS.get(assigned, -1)
                    - PRIORITY_LEVELS[issue["expected_priority"]]
                )
                == 1
            )
        except Exception as exc:  # network/HTTP failure: score as a miss, keep going
            record["error"] = str(exc)
        elapsed = time.monotonic() - start
        record["elapsed_s"] = round(elapsed, 2)
        total_time += elapsed
        results.append(record)

    exact_count = sum(1 for r in results if r["exact_match"])
    # Previously computed per-issue but never summarized.
    off_by_one_count = sum(1 for r in results if r["off_by_one"])
    accuracy = exact_count / len(ISSUES)

    return {
        "benchmark": "issue_triage",
        "model": model,
        "total_issues": len(ISSUES),
        "exact_matches": exact_count,
        "off_by_one": off_by_one_count,
        "accuracy": round(accuracy, 3),
        "passed": accuracy >= 0.80,
        "total_time_s": round(total_time, 2),
        "results": results,
    }
|
|
|
|
|
|
if __name__ == "__main__":
    # Usage: python <script> [model] — model defaults to hermes3:8b.
    target_model = "hermes3:8b"
    if len(sys.argv) > 1:
        target_model = sys.argv[1]
    print(f"Running issue-triage benchmark against {target_model}...")
    outcome = run_benchmark(target_model)
    print(json.dumps(outcome, indent=2))
    # Exit status mirrors pass/fail so CI can gate on it.
    sys.exit(0 if outcome["passed"] else 1)
|