#!/usr/bin/env python3
"""Benchmark 5: Issue Triage Quality

Present 5 issues with known correct priorities and measure accuracy.
"""

from __future__ import annotations

import json
import re
import sys
import time

import requests

# Base URL of the Ollama HTTP server that the benchmark sends prompts to.
OLLAMA_URL = "http://localhost:11434"

# Prompt sent once per issue; ``{title}`` and ``{description}`` are filled in
# with ``str.format``, so the literal JSON braces are doubled.
# NOTE(review): the "priority" and "reason" placeholder values are empty
# strings and the prompt never lists the allowed priority labels - presumably
# placeholder text was lost at some point; confirm against the intended prompt.
TRIAGE_PROMPT_TEMPLATE = """\
You are a software project triage agent. Assign a priority to the following issue.

Issue: {title}
Description: {description}

Respond ONLY with valid JSON:
{{"priority": "", "reason": ""}}
"""

# Benchmark fixtures: each issue carries the priority the model is expected
# to assign, expressed as one of the canonical labels in VALID_PRIORITIES.
ISSUES = [
    {
        "title": "Production database is returning 500 errors on all queries",
        "description": "All users are affected, no transactions are completing, revenue is being lost.",
        "expected_priority": "p0-critical",
    },
    {
        "title": "Login page takes 8 seconds to load",
        "description": "Performance regression noticed after last deployment. Users are complaining but can still log in.",
        "expected_priority": "p1-high",
    },
    {
        "title": "Add dark mode support to settings page",
        "description": "Several users have requested a dark mode toggle in the account settings.",
        "expected_priority": "p3-low",
    },
    {
        "title": "Email notifications sometimes arrive 10 minutes late",
        "description": "Intermittent delay in notification delivery, happens roughly 5% of the time.",
        "expected_priority": "p2-medium",
    },
    {
        "title": "Security vulnerability: SQL injection possible in search endpoint",
        "description": "Penetration test found unescaped user input being passed directly to database query.",
        "expected_priority": "p0-critical",
    },
]

# Canonical priority labels; anything else is treated as "no priority assigned".
VALID_PRIORITIES = {"p0-critical", "p1-high", "p2-medium", "p3-low"}

# Map p0 -> 0, p1 -> 1, etc.
# for fuzzy scoring (±1 level = partial credit)
PRIORITY_LEVELS = {"p0-critical": 0, "p1-high": 1, "p2-medium": 2, "p3-low": 3}

# Shorthand spellings accepted by normalize_priority, keyed by the lowercased,
# stripped model output. Built once at import time rather than on every call.
_PRIORITY_ALIASES = {
    "critical": "p0-critical",
    "p0": "p0-critical",
    "0": "p0-critical",
    "high": "p1-high",
    "p1": "p1-high",
    "1": "p1-high",
    "medium": "p2-medium",
    "p2": "p2-medium",
    "2": "p2-medium",
    "low": "p3-low",
    "p3": "p3-low",
    "3": "p3-low",
}

# Matches a Markdown code fence (optionally tagged ``json``) wrapping an object.
_FENCED_OBJECT_RE = re.compile(r"```(?:json)?\s*(\{.*?\})\s*```", re.DOTALL)


def extract_json(text: str) -> dict | None:
    """Extract the first JSON object found in a model response.

    Three strategies are tried in order: the whole (stripped) text, the
    contents of a Markdown code fence, and finally a scan that tries to decode
    an object starting at each ``{`` in the text. The scan uses a real JSON
    decoder, so nested objects and braces inside string values are handled.

    Args:
        text: Raw text returned by the model.

    Returns:
        The decoded object, or ``None`` when no JSON object can be decoded.
        Valid JSON that is not an object (a bare string, number or list) is
        not returned as-is, so the result always matches the annotation.
    """
    text = text.strip()

    candidates = [text]
    fence_match = _FENCED_OBJECT_RE.search(text)
    if fence_match:
        candidates.append(fence_match.group(1))

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed

    # Fallback: the object is embedded in surrounding prose. raw_decode stops
    # at the end of the first complete value and ignores whatever follows it.
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            parsed, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(parsed, dict):
                return parsed
        start = text.find("{", start + 1)
    return None


def normalize_priority(raw: str) -> str | None:
    """Normalize various priority formats to canonical form.

    Args:
        raw: Priority text as produced by the model, e.g. ``"P1"`` or
            ``"critical"``. Case and surrounding whitespace are ignored.

    Returns:
        One of the labels in ``VALID_PRIORITIES``, or ``None`` when the text
        is neither a canonical label nor a known shorthand.
    """
    raw = raw.lower().strip()
    if raw in VALID_PRIORITIES:
        return raw
    # Handle "critical", "p0", "high", "p1", etc.
    return _PRIORITY_ALIASES.get(raw)


def run_prompt(model: str, prompt: str) -> str:
    """Send one non-streaming generate request to Ollama and return its text.

    Args:
        model: Name of the Ollama model to query.
        prompt: Full prompt text.

    Returns:
        The ``"response"`` field of the JSON body returned by the server.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        KeyError: If the response body has no ``"response"`` field.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 256},
    }
    resp = requests.post(f"{OLLAMA_URL}/api/generate", json=payload, timeout=120)
    resp.raise_for_status()
    return resp.json()["response"]


def _grade_response(raw: str, expected: str) -> dict:
    """Score one raw model response against the expected priority label."""
    parsed = extract_json(raw)
    assigned = None
    if parsed is not None:
        assigned = normalize_priority(str(parsed.get("priority", "")))

    exact_match = assigned == expected
    # Only a recognised label can be "one level away"; a missing or unknown
    # label must never earn partial credit.
    off_by_one = (
        assigned in PRIORITY_LEVELS
        and abs(PRIORITY_LEVELS[assigned] - PRIORITY_LEVELS[expected]) == 1
    )
    return {
        "assigned": assigned,
        "exact_match": exact_match,
        "off_by_one": off_by_one,
        "valid_json": parsed is not None,
    }


def run_benchmark(model: str) -> dict:
    """Run issue triage benchmark for a single model.

    Every issue in ``ISSUES`` is sent to the model. A failure while querying
    or grading one issue is recorded on that issue's result (``"error"`` key)
    instead of aborting the whole run.

    Args:
        model: Name of the Ollama model to benchmark.

    Returns:
        A summary dict with the exact-match count, the off-by-one count,
        ``accuracy`` (exact matches only), ``passed`` (accuracy of at least
        0.80), the total elapsed time and the per-issue ``results`` list.
    """
    results = []
    total_time = 0.0

    for issue_id, issue in enumerate(ISSUES, 1):
        expected = issue["expected_priority"]
        prompt = TRIAGE_PROMPT_TEMPLATE.format(
            title=issue["title"], description=issue["description"]
        )
        error = None
        # perf_counter is monotonic, so a system clock adjustment during the
        # request cannot produce a negative or inflated duration.
        start = time.perf_counter()
        try:
            raw = run_prompt(model, prompt)
            elapsed = time.perf_counter() - start
            graded = _grade_response(raw, expected)
        except Exception as exc:  # deliberate: one bad issue must not end the run
            elapsed = time.perf_counter() - start
            graded = {
                "assigned": None,
                "exact_match": False,
                "off_by_one": False,
                "valid_json": False,
            }
            error = str(exc)

        entry = {
            "issue_id": issue_id,
            "title": issue["title"][:60],
            "expected": expected,
            **graded,
            "elapsed_s": round(elapsed, 2),
        }
        if error is not None:
            entry["error"] = error
        results.append(entry)
        total_time += elapsed

    exact_count = sum(1 for r in results if r["exact_match"])
    off_by_one_count = sum(1 for r in results if r["off_by_one"])
    # Guard against an empty fixture list instead of raising ZeroDivisionError.
    accuracy = exact_count / len(ISSUES) if ISSUES else 0.0

    return {
        "benchmark": "issue_triage",
        "model": model,
        "total_issues": len(ISSUES),
        "exact_matches": exact_count,
        "off_by_one_matches": off_by_one_count,
        "accuracy": round(accuracy, 3),
        "passed": accuracy >= 0.80,
        "total_time_s": round(total_time, 2),
        "results": results,
    }


def main() -> None:
    """Run the benchmark for the model named on the command line.

    The model defaults to ``hermes3:8b``. The JSON summary is printed to
    stdout and the process exits with status 0 when the benchmark passed,
    1 otherwise.
    """
    model = sys.argv[1] if len(sys.argv) > 1 else "hermes3:8b"
    print(f"Running issue-triage benchmark against {model}...")
    result = run_benchmark(model)
    print(json.dumps(result, indent=2))
    sys.exit(0 if result["passed"] else 1)


if __name__ == "__main__":
    main()