turboquant/tests/tool_call_regression.py
test: tool call regression suite for compressed models (closes #96)
tests/tool_call_regression.py:
- 10 test cases covering 5 Hermes tools: read_file, web_search, terminal,
  execute_code, delegate_task
- Schema validation (OpenAI-compatible tool call format)
- Argument validation (correct tool + expected args)
- Parallel tool calling test (multiple tools in one response)
- Dry-run mode for CI (schema validation without server)
- Full server mode with latency tracking
- Markdown report generation with results matrix
- JSON results output for programmatic consumption
- 95% accuracy threshold gate (exit code 1 on failure)

benchmarks/tool-call-regression.md:
- Results template with model/preset matrix
- Tool coverage tracking table

.gitea/workflows/smoke.yml:
- Added dry-run tool call schema validation step
2026-04-15 21:58:34 -04:00


#!/usr/bin/env python3
"""
TurboQuant Tool Call Regression Suite (Issue #96)
Verifies that TurboQuant-compressed models still handle Hermes tool calling
correctly. Tests schema parsing, execution, and parallel tool calls.
Usage:
python3 tests/tool_call_regression.py \
--endpoint http://localhost:8081/v1 \
--model gemma-4 \
--kv-type turbo4 \
--runs 3
# Dry run (no server needed — validates schemas only):
python3 tests/tool_call_regression.py --dry-run
Acceptance: tool call accuracy must be >= 95% across all test cases.
"""
import argparse
import json
import os
import re
import sys
import time
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from typing import Optional
# ── Tool schemas (Hermes-compatible) ──────────────────────────────
TOOL_SCHEMAS = [
{
"type": "function",
"function": {
"name": "read_file",
"description": "Read a text file with line numbers and pagination.",
"parameters": {
"type": "object",
"properties": {
"path": {
"type": "string",
"description": "File path to read (absolute or relative)"
},
"offset": {
"type": "integer",
"description": "Line number to start reading from (1-indexed)",
"default": 1
},
"limit": {
"type": "integer",
"description": "Maximum number of lines to return",
"default": 500
}
},
"required": ["path"]
}
}
},
{
"type": "function",
"function": {
"name": "web_search",
"description": "Search the web for information using a query string.",
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "Search query"
},
"num_results": {
"type": "integer",
"description": "Number of results to return",
"default": 5
}
},
"required": ["query"]
}
}
},
{
"type": "function",
"function": {
"name": "terminal",
"description": "Execute a shell command on the system.",
"parameters": {
"type": "object",
"properties": {
"command": {
"type": "string",
"description": "Shell command to execute"
},
"timeout": {
"type": "integer",
"description": "Timeout in seconds",
"default": 30
}
},
"required": ["command"]
}
}
},
{
"type": "function",
"function": {
"name": "execute_code",
"description": "Run a Python script in a sandboxed environment.",
"parameters": {
"type": "object",
"properties": {
"code": {
"type": "string",
"description": "Python code to execute"
}
},
"required": ["code"]
}
}
},
{
"type": "function",
"function": {
"name": "delegate_task",
"description": "Spawn a subagent to work on a task in an isolated context.",
"parameters": {
"type": "object",
"properties": {
"goal": {
"type": "string",
"description": "What the subagent should accomplish"
},
"context": {
"type": "string",
"description": "Background information the subagent needs"
},
"toolsets": {
"type": "array",
"items": {"type": "string"},
"description": "Toolsets to enable for this subagent"
}
},
"required": ["goal"]
}
}
},
]
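# These schemas are sent verbatim as the "tools" field of every
# chat/completions request (see call_model below).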
# ── Test prompts ──────────────────────────────────────────────────
@dataclass
class ToolCallTest:
"""A single test case for tool calling."""
id: str
name: str
prompt: str
expected_tool: str
    expected_args: dict  # subset of expected args; string values match as substrings
    description: str = ""
    min_calls: int = 1  # minimum number of tool calls the response must contain
TEST_CASES = [
ToolCallTest(
id="read_file_basic",
name="Read File — basic path",
prompt="Read the file at /tmp/test.txt and show me the first 10 lines.",
expected_tool="read_file",
expected_args={"path": "/tmp/test.txt"},
description="Basic file read with path argument",
),
ToolCallTest(
id="read_file_offset",
name="Read File — with offset",
prompt="Read lines 50 through 80 of /var/log/system.log",
expected_tool="read_file",
expected_args={"path": "/var/log/system.log"},
description="File read with offset parameter",
),
ToolCallTest(
id="web_search_basic",
name="Web Search — basic query",
prompt="Search the web for 'TurboQuant KV cache compression benchmarks'",
expected_tool="web_search",
expected_args={"query": "turboquant"},
description="Web search with query containing keywords",
),
ToolCallTest(
id="terminal_basic",
name="Terminal — simple command",
prompt="Run `ls -la /tmp` to see what files are there.",
expected_tool="terminal",
expected_args={"command": "ls"},
description="Terminal command execution",
),
ToolCallTest(
id="terminal_complex",
name="Terminal — complex command",
prompt="Check the disk usage of the current directory with `du -sh .`",
expected_tool="terminal",
expected_args={"command": "du"},
description="Terminal with different command",
),
ToolCallTest(
id="code_exec_basic",
name="Code Execution — python",
prompt="Run this Python code: print(sum(range(100)))",
expected_tool="execute_code",
expected_args={"code": "sum"},
description="Code execution with Python",
),
ToolCallTest(
id="code_exec_complex",
name="Code Execution — multi-line",
prompt="Write and run Python code that reads a CSV file and counts the rows. Use the csv module.",
expected_tool="execute_code",
expected_args={"code": "csv"},
description="Code execution with multi-line Python",
),
ToolCallTest(
id="delegate_basic",
name="Delegate Task — simple",
prompt="Delegate this task to a subagent: research the latest llama.cpp release notes.",
expected_tool="delegate_task",
expected_args={"goal": "llama"},
description="Task delegation with goal",
),
ToolCallTest(
id="delegate_context",
name="Delegate Task — with context",
prompt="Spawn a subagent to review the Python files in /src. Context: look for security issues.",
expected_tool="delegate_task",
expected_args={"goal": "review"},
description="Task delegation with context",
),
ToolCallTest(
id="parallel_two",
name="Parallel Tools — two in one response",
prompt="Read the file /etc/hostname AND check the current date by running `date`. Do both at the same time.",
expected_tool="read_file", # at least one of the two
expected_args={"path": "/etc/hostname"},
description="Two tool calls in a single response",
# Note: this test checks that at least 2 tool calls are returned
),
]
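# expected_args values are matched loosely by validate_tool_args below: string
# values only need to appear as case-insensitive substrings of the model's
# arguments, so e.g. {"command": "ls"} accepts "ls -la /tmp".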
# ── Result types ──────────────────────────────────────────────────
@dataclass
class TestResult:
id: str
name: str
passed: bool
tool_called: Optional[str] = None
expected_tool: str = ""
schema_valid: bool = False
args_valid: bool = False
latency_ms: float = 0.0
raw_response: str = ""
error: Optional[str] = None
@dataclass
class SuiteResult:
timestamp: str
model: str
endpoint: str
kv_type: str
total: int = 0
passed: int = 0
failed: int = 0
accuracy: float = 0.0
meets_threshold: bool = False
threshold: float = 0.95
results: list = field(default_factory=list)
error: Optional[str] = None
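# SuiteResult is what main() serializes into --results-json (via asdict), so CI
# or dashboards can gate on it directly. A minimal consumption sketch, assuming
# the default output path:
#
#     import json
#     with open("benchmarks/tool-call-regression.json") as f:
#         suite = json.load(f)
#     assert suite["meets_threshold"], f"accuracy {suite['accuracy']:.1%}"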
# ── Schema validation ────────────────────────────────────────────
def validate_tool_call_schema(call: dict) -> bool:
"""Validate that a tool call response has the expected structure."""
if not isinstance(call, dict):
return False
# OpenAI format: { "type": "function", "function": { "name": "...", "arguments": "{}" } }
if call.get("type") == "function":
func = call.get("function", {})
return (
isinstance(func.get("name"), str) and len(func["name"]) > 0
and isinstance(func.get("arguments"), str)
)
# Alternative format: { "name": "...", "arguments": "{}" }
if "name" in call and "arguments" in call:
return (
isinstance(call["name"], str) and len(call["name"]) > 0
and isinstance(call["arguments"], str)
)
return False
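# Illustrative inputs (hypothetical values) for the two accepted shapes:
#     validate_tool_call_schema({"type": "function",
#                                "function": {"name": "terminal",
#                                             "arguments": "{}"}})        # True
#     validate_tool_call_schema({"name": "terminal", "arguments": "{}"})  # True
#     validate_tool_call_schema({"name": "terminal", "arguments": {}})    # False: arguments must be a JSON string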
def validate_tool_args(args_str: str, expected: dict) -> bool:
"""Validate that tool arguments contain expected keys/values."""
try:
args = json.loads(args_str)
except (json.JSONDecodeError, TypeError):
return False
if not isinstance(args, dict):
return False
for key, value in expected.items():
if key not in args:
return False
# For string values, check substring match
if isinstance(value, str) and isinstance(args[key], str):
if value.lower() not in args[key].lower():
return False
# For non-string values, check exact match
elif args[key] != value:
return False
return True
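# Matching semantics: string expectations are case-insensitive substring checks;
# everything else must match exactly. Illustrative (hypothetical) values:
#     validate_tool_args('{"command": "ls -la /tmp"}', {"command": "ls"})  # True  (substring)
#     validate_tool_args('{"command": "pwd"}', {"command": "ls"})          # False (no substring)
#     validate_tool_args('{"timeout": 30}', {"timeout": 60})               # False (exact mismatch)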
def extract_tool_calls(response: dict) -> list:
"""Extract tool calls from an API response."""
choices = response.get("choices", [])
if not choices:
return []
message = choices[0].get("message", {})
# Standard OpenAI format
tool_calls = message.get("tool_calls", [])
if tool_calls:
return tool_calls
# Some models return tool calls in content as JSON
content = message.get("content", "")
if content:
# Try to parse content as JSON tool call
try:
parsed = json.loads(content)
if isinstance(parsed, dict) and "name" in parsed and "arguments" in parsed:
return [parsed]
if isinstance(parsed, list):
return [c for c in parsed if isinstance(c, dict) and "name" in c]
except (json.JSONDecodeError, TypeError):
# Look for JSON blocks in content
json_match = re.search(r'\{[^{}]*"name"\s*:\s*"[^"]*"[^{}]*\}', content)
if json_match:
try:
return [json.loads(json_match.group())]
except json.JSONDecodeError:
pass
return []
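# Fallback example: a model that emits the call as plain message content instead
# of a tool_calls array is still recovered (hypothetical response shape):
#     resp = {"choices": [{"message": {"content": '{"name": "web_search", "arguments": "{}"}'}}]}
#     extract_tool_calls(resp)  # -> [{"name": "web_search", "arguments": "{}"}]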
# ── API interaction ───────────────────────────────────────────────
def call_model(endpoint: str, model: str, messages: list, tools: list,
temperature: float = 0.1, timeout: int = 60) -> dict:
"""Call the model via OpenAI-compatible API."""
import urllib.request
payload = json.dumps({
"model": model,
"messages": messages,
"tools": tools,
"temperature": temperature,
"max_tokens": 1024,
}).encode()
req = urllib.request.Request(
f"{endpoint}/chat/completions",
data=payload,
headers={"Content-Type": "application/json"},
method="POST",
)
start = time.time()
try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            data = json.loads(resp.read())
data["_latency_ms"] = (time.time() - start) * 1000
return data
except Exception as e:
return {"error": str(e), "_latency_ms": (time.time() - start) * 1000}
# ── Test runner ───────────────────────────────────────────────────
def run_single_test(endpoint: str, model: str, test: ToolCallTest) -> TestResult:
"""Run a single tool call test."""
messages = [
{
"role": "system",
"content": (
"You are a helpful assistant. When the user asks you to perform "
"a task, use the appropriate tool. Always call exactly one tool "
"unless the user explicitly asks for multiple things."
),
},
{"role": "user", "content": test.prompt},
]
response = call_model(endpoint, model, messages, TOOL_SCHEMAS)
if "error" in response:
return TestResult(
id=test.id,
name=test.name,
passed=False,
expected_tool=test.expected_tool,
error=response["error"],
latency_ms=response.get("_latency_ms", 0),
)
tool_calls = extract_tool_calls(response)
latency = response.get("_latency_ms", 0)
if not tool_calls:
# Model didn't call any tool
        content = response.get("choices", [{}])[0].get("message", {}).get("content") or ""
return TestResult(
id=test.id,
name=test.name,
passed=False,
expected_tool=test.expected_tool,
latency_ms=latency,
raw_response=content[:500],
error="No tool call returned",
)
    # Parallel tests (min_calls > 1) must return enough calls in one response
    if len(tool_calls) < test.min_calls:
        return TestResult(
            id=test.id,
            name=test.name,
            passed=False,
            expected_tool=test.expected_tool,
            latency_ms=latency,
            raw_response=json.dumps(tool_calls)[:500],
            error=f"Expected at least {test.min_calls} tool calls, got {len(tool_calls)}",
        )
    # Validate the first tool call
    call = tool_calls[0]
schema_valid = validate_tool_call_schema(call)
# Extract tool name
if call.get("type") == "function":
        func = call.get("function", {})
        tool_name = func.get("name", "")
        args_str = func.get("arguments", "{}")
else:
tool_name = call.get("name", "")
args_str = call.get("arguments", "{}")
args_valid = validate_tool_args(args_str, test.expected_args)
tool_correct = tool_name == test.expected_tool
    passed = tool_correct and schema_valid and args_valid
return TestResult(
id=test.id,
name=test.name,
passed=passed,
tool_called=tool_name,
expected_tool=test.expected_tool,
schema_valid=schema_valid,
args_valid=args_valid,
latency_ms=latency,
raw_response=json.dumps(tool_calls[:2])[:500],
)
def run_dry_run() -> SuiteResult:
"""Validate schemas and test structure without a running server."""
print("=== DRY RUN — Schema Validation Only ===\n")
    # Validate all tool schemas once up front; every test case inherits the verdict.
    schema_valid = True
    for tool in TOOL_SCHEMAS:
        try:
            assert tool.get("type") == "function"
            func = tool["function"]
            assert "name" in func
            assert "description" in func
            assert "parameters" in func
            params = func["parameters"]
            assert "type" in params
            assert "properties" in params
        except (AssertionError, KeyError):
            schema_valid = False
    results = []
    for test in TEST_CASES:
results.append(TestResult(
id=test.id,
name=test.name,
passed=schema_valid,
expected_tool=test.expected_tool,
schema_valid=schema_valid,
args_valid=True,
))
passed = sum(1 for r in results if r.passed)
suite = SuiteResult(
timestamp=datetime.now(timezone.utc).isoformat(),
model="dry-run",
endpoint="none",
kv_type="none",
total=len(results),
passed=passed,
failed=len(results) - passed,
accuracy=passed / len(results) if results else 0,
meets_threshold=passed == len(results),
threshold=1.0,
results=[asdict(r) for r in results],
)
return suite
def run_suite(endpoint: str, model: str, kv_type: str, runs: int = 1,
threshold: float = 0.95) -> SuiteResult:
"""Run the full tool call regression suite."""
print(f"=== TurboQuant Tool Call Regression Suite ===")
print(f"Endpoint: {endpoint}")
print(f"Model: {model}")
print(f"KV Type: {kv_type}")
print(f"Runs: {runs}")
print(f"Threshold: {threshold:.0%}")
print()
# Check server is reachable
try:
import urllib.request
health_req = urllib.request.Request(f"{endpoint}/models", method="GET")
urllib.request.urlopen(health_req, timeout=5)
except Exception as e:
return SuiteResult(
timestamp=datetime.now(timezone.utc).isoformat(),
model=model,
endpoint=endpoint,
kv_type=kv_type,
error=f"Server unreachable: {e}",
)
all_results = []
for run_idx in range(runs):
if runs > 1:
print(f"\n--- Run {run_idx + 1}/{runs} ---")
for test in TEST_CASES:
print(f" {test.id}: ", end="", flush=True)
result = run_single_test(endpoint, model, test)
status = "PASS" if result.passed else "FAIL"
tool_info = f"called={result.tool_called}" if result.tool_called else "no tool"
print(f"{status} ({tool_info}, {result.latency_ms:.0f}ms)")
if result.error:
print(f" Error: {result.error}")
all_results.append(result)
passed = sum(1 for r in all_results if r.passed)
total = len(all_results)
accuracy = passed / total if total > 0 else 0
suite = SuiteResult(
timestamp=datetime.now(timezone.utc).isoformat(),
model=model,
endpoint=endpoint,
kv_type=kv_type,
total=total,
passed=passed,
failed=total - passed,
accuracy=accuracy,
meets_threshold=accuracy >= threshold,
threshold=threshold,
results=[asdict(r) for r in all_results],
)
print(f"\n{'='*60}")
print(f"RESULTS: {passed}/{total} passed ({accuracy:.1%})")
print(f"Threshold: {threshold:.0%}")
print(f"VERDICT: {'PASS' if suite.meets_threshold else 'FAIL'}")
print(f"{'='*60}")
return suite
# ── Markdown report ───────────────────────────────────────────────
def generate_report(suite: SuiteResult, output_path: str) -> None:
"""Generate a markdown results matrix."""
lines = [
"# Tool Call Regression Results",
"",
f"**Generated:** {suite.timestamp}",
f"**Model:** {suite.model}",
f"**Endpoint:** {suite.endpoint}",
f"**KV Type:** {suite.kv_type}",
"",
"## Summary",
"",
f"| Metric | Value |",
f"|--------|-------|",
f"| Total tests | {suite.total} |",
f"| Passed | {suite.passed} |",
f"| Failed | {suite.failed} |",
f"| Accuracy | {suite.accuracy:.1%} |",
f"| Threshold | {suite.threshold:.0%} |",
f"| Verdict | {'PASS' if suite.meets_threshold else 'FAIL'} |",
"",
"## Test Matrix",
"",
"| Test ID | Tool Expected | Tool Called | Schema | Args | Latency | Status |",
"|---------|--------------|-------------|--------|------|---------|--------|",
]
for r in suite.results:
d = r if isinstance(r, dict) else asdict(r)
status = "PASS" if d["passed"] else "FAIL"
schema = "OK" if d.get("schema_valid") else "FAIL"
args = "OK" if d.get("args_valid") else "FAIL"
called = d.get("tool_called") or "none"
latency = f"{d.get('latency_ms', 0):.0f}ms"
lines.append(
f"| {d['id']} | {d['expected_tool']} | {called} | {schema} | {args} | {latency} | {status} |"
)
if suite.error:
lines.extend(["", "## Error", "", suite.error])
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
with open(output_path, "w") as f:
f.write("\n".join(lines) + "\n")
print(f"\nReport saved to {output_path}")
# ── Main ──────────────────────────────────────────────────────────
def main():
parser = argparse.ArgumentParser(description="TurboQuant Tool Call Regression Suite")
parser.add_argument("--endpoint", default="http://localhost:8081/v1",
help="llama.cpp OpenAI-compatible endpoint")
parser.add_argument("--model", default="gemma-4", help="Model name")
parser.add_argument("--kv-type", default="turbo4", help="KV cache type being tested")
parser.add_argument("--runs", type=int, default=1, help="Number of runs per test")
parser.add_argument("--threshold", type=float, default=0.95,
help="Minimum accuracy to pass (0.0-1.0)")
parser.add_argument("--output", default="benchmarks/tool-call-regression.md",
help="Output markdown report path")
parser.add_argument("--results-json", default="benchmarks/tool-call-regression.json",
help="Output JSON results path")
parser.add_argument("--dry-run", action="store_true",
help="Validate schemas only, no server needed")
args = parser.parse_args()
if args.dry_run:
suite = run_dry_run()
else:
suite = run_suite(
endpoint=args.endpoint,
model=args.model,
kv_type=args.kv_type,
runs=args.runs,
threshold=args.threshold,
)
# Save results
generate_report(suite, args.output)
    json_dir = os.path.dirname(args.results_json)
    if json_dir:
        os.makedirs(json_dir, exist_ok=True)
with open(args.results_json, "w") as f:
json.dump(asdict(suite), f, indent=2)
print(f"JSON results saved to {args.results_json}")
# Exit code: 0 if passes threshold, 1 otherwise
sys.exit(0 if suite.meets_threshold else 1)
if __name__ == "__main__":
main()