198 lines
6.1 KiB
Python
198 lines
6.1 KiB
Python
#!/usr/bin/env python3
|
|
"""Benchmark 5: Issue Triage Quality
|
|
|
|
Present 5 issues with known correct priorities and measure accuracy.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import re
|
|
import sys
|
|
import time
|
|
|
|
import requests
|
|
|
|
# Base URL of the local Ollama server the benchmark talks to.
OLLAMA_URL = "http://localhost:11434"

# Prompt sent once per issue.  Doubled braces in the JSON example escape
# str.format placeholders; only {title} and {description} are substituted.
TRIAGE_PROMPT_TEMPLATE = """\
You are a software project triage agent. Assign a priority to the following issue.

Issue: {title}
Description: {description}

Respond ONLY with valid JSON:
{{"priority": "<p0-critical|p1-high|p2-medium|p3-low>", "reason": "<one sentence>"}}
"""
|
|
|
|
# Fixed evaluation set: five issues with a known correct priority each.
# The benchmark scores the model's assigned priority against
# "expected_priority".
ISSUES = [
    {
        "title": "Production database is returning 500 errors on all queries",
        "description": "All users are affected, no transactions are completing, revenue is being lost.",
        "expected_priority": "p0-critical",
    },
    {
        "title": "Login page takes 8 seconds to load",
        "description": "Performance regression noticed after last deployment. Users are complaining but can still log in.",
        "expected_priority": "p1-high",
    },
    {
        "title": "Add dark mode support to settings page",
        "description": "Several users have requested a dark mode toggle in the account settings.",
        "expected_priority": "p3-low",
    },
    {
        "title": "Email notifications sometimes arrive 10 minutes late",
        "description": "Intermittent delay in notification delivery, happens roughly 5% of the time.",
        "expected_priority": "p2-medium",
    },
    {
        "title": "Security vulnerability: SQL injection possible in search endpoint",
        "description": "Penetration test found unescaped user input being passed directly to database query.",
        "expected_priority": "p0-critical",
    },
]

# Canonical priority labels the model is expected to emit.
VALID_PRIORITIES = {"p0-critical", "p1-high", "p2-medium", "p3-low"}

# Map p0 -> 0, p1 -> 1, etc. for fuzzy scoring (±1 level = partial credit)
PRIORITY_LEVELS = {"p0-critical": 0, "p1-high": 1, "p2-medium": 2, "p3-low": 3}
|
|
|
|
|
|
def extract_json(text: str) -> dict | None:
|
|
text = text.strip()
|
|
try:
|
|
return json.loads(text)
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
fence_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
|
|
if fence_match:
|
|
try:
|
|
return json.loads(fence_match.group(1))
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
brace_match = re.search(r"\{[^{}]*\}", text, re.DOTALL)
|
|
if brace_match:
|
|
try:
|
|
return json.loads(brace_match.group(0))
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
return None
|
|
|
|
|
|
def normalize_priority(raw: str) -> str | None:
    """Normalize various priority formats to canonical form.

    Accepts the canonical labels themselves plus the short forms
    "critical"/"p0"/"0", "high"/"p1"/"1", "medium"/"p2"/"2",
    "low"/"p3"/"3".  Returns None for anything unrecognized.
    """
    cleaned = raw.lower().strip()
    canonical = ("p0-critical", "p1-high", "p2-medium", "p3-low")
    if cleaned in canonical:
        return cleaned
    # Derive the accepted aliases from each canonical name: its word
    # part, its pN prefix, and the bare level digit.
    for level, name in enumerate(canonical):
        prefix, _, word = name.partition("-")
        if cleaned in (word, prefix, str(level)):
            return name
    return None
|
|
|
|
|
|
def run_prompt(model: str, prompt: str) -> str:
    """Send one non-streaming generation request to Ollama.

    Returns the model's raw response text.  Raises requests.HTTPError
    on a non-2xx status (via raise_for_status).
    """
    # Low temperature for deterministic triage; cap generation length.
    options = {"temperature": 0.1, "num_predict": 256}
    body = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": options,
    }
    response = requests.post(f"{OLLAMA_URL}/api/generate", json=body, timeout=120)
    response.raise_for_status()
    return response.json()["response"]
|
|
|
|
|
|
def run_benchmark(model: str) -> dict:
    """Run the issue-triage benchmark for a single model.

    Each issue in ISSUES is formatted into TRIAGE_PROMPT_TEMPLATE, sent
    to the model via run_prompt, and the returned JSON priority is
    scored against the known correct one.  A request or parse failure
    on one issue is recorded as a miss (with an "error" field) rather
    than aborting the whole run.

    Returns a summary dict: counts, accuracy, pass/fail at the 80%
    threshold, total wall time, and the per-issue result records.
    """
    results = []
    total_time = 0.0

    for i, issue in enumerate(ISSUES, 1):
        prompt = TRIAGE_PROMPT_TEMPLATE.format(
            title=issue["title"], description=issue["description"]
        )
        # Base record doubles as the failure-path record; the success
        # path overwrites the scoring fields.  This removes the
        # duplicated dict literal the try/except branches used to share.
        record = {
            "issue_id": i,
            "title": issue["title"][:60],
            "expected": issue["expected_priority"],
            "assigned": None,
            "exact_match": False,
            "off_by_one": False,
            "valid_json": False,
        }
        # monotonic() is immune to wall-clock adjustments mid-run.
        start = time.monotonic()
        try:
            raw = run_prompt(model, prompt)
            parsed = extract_json(raw)
            # Only a JSON *object* counts as valid output; a bare list
            # or scalar that happens to parse is not a triage answer.
            record["valid_json"] = isinstance(parsed, dict)
            if record["valid_json"]:
                record["assigned"] = normalize_priority(str(parsed.get("priority", "")))
            assigned = record["assigned"]
            record["exact_match"] = assigned == issue["expected_priority"]
            record["off_by_one"] = (
                assigned is not None
                and not record["exact_match"]
                and abs(
                    PRIORITY_LEVELS.get(assigned, -1)
                    - PRIORITY_LEVELS[issue["expected_priority"]]
                )
                == 1
            )
        except Exception as exc:  # network/HTTP failure: score as a miss, keep going
            record["error"] = str(exc)
        elapsed = time.monotonic() - start
        record["elapsed_s"] = round(elapsed, 2)
        total_time += elapsed
        results.append(record)

    exact_count = sum(1 for r in results if r["exact_match"])
    # Previously computed per-issue but never summarized.
    off_by_one_count = sum(1 for r in results if r["off_by_one"])
    accuracy = exact_count / len(ISSUES)

    return {
        "benchmark": "issue_triage",
        "model": model,
        "total_issues": len(ISSUES),
        "exact_matches": exact_count,
        "off_by_one": off_by_one_count,
        "accuracy": round(accuracy, 3),
        "passed": accuracy >= 0.80,
        "total_time_s": round(total_time, 2),
        "results": results,
    }
|
|
|
|
|
|
if __name__ == "__main__":
    # Usage: python <script> [model] — model defaults to hermes3:8b.
    target_model = "hermes3:8b"
    if len(sys.argv) > 1:
        target_model = sys.argv[1]
    print(f"Running issue-triage benchmark against {target_model}...")
    outcome = run_benchmark(target_model)
    print(json.dumps(outcome, indent=2))
    # Exit status mirrors pass/fail so CI can gate on it.
    sys.exit(0 if outcome["passed"] else 1)
|