Compare commits
1 commit
step35/55- ... burn/96-17

| Author | SHA1 | Date |
|---|---|---|
| | 02c0cc2b23 | |
```diff
@@ -22,3 +22,7 @@ jobs:
       run: |
         if grep -rE 'sk-or-|sk-ant-|ghp_|AKIA' . --include='*.yml' --include='*.py' --include='*.sh' 2>/dev/null | grep -v .gitea | grep -v llama-cpp-fork; then exit 1; fi
         echo "PASS: No secrets"
+    - name: Tool call regression (schema validation)
+      run: |
+        python3 tests/tool_call_regression.py --dry-run
+        echo "PASS: Tool call schemas valid"
```
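
The added step gates on the script's exit status: as `main()` in the new test file shows, `tool_call_regression.py` exits 0 only when accuracy meets the threshold. A minimal sketch of reproducing the same gate outside CI; the `subprocess` wrapper is illustrative, not part of the change:

```python
# Sketch: run the same dry-run gate locally, from the repository root.
import subprocess
import sys

proc = subprocess.run([sys.executable, "tests/tool_call_regression.py", "--dry-run"])
# Exit code 0 means accuracy met the threshold; anything else should fail the build.
if proc.returncode != 0:
    sys.exit("tool call regression failed")
```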
benchmarks/tool-call-regression.json (new file, 135 lines)
@@ -0,0 +1,135 @@
```json
{
  "timestamp": "2026-04-16T01:56:48.462512+00:00",
  "model": "dry-run",
  "endpoint": "none",
  "kv_type": "none",
  "total": 10,
  "passed": 10,
  "failed": 0,
  "accuracy": 1.0,
  "meets_threshold": true,
  "threshold": 1.0,
  "results": [
    {
      "id": "read_file_basic",
      "name": "Read File \u2014 basic path",
      "passed": true,
      "tool_called": null,
      "expected_tool": "read_file",
      "schema_valid": true,
      "args_valid": true,
      "latency_ms": 0.0,
      "raw_response": "",
      "error": null
    },
    {
      "id": "read_file_offset",
      "name": "Read File \u2014 with offset",
      "passed": true,
      "tool_called": null,
      "expected_tool": "read_file",
      "schema_valid": true,
      "args_valid": true,
      "latency_ms": 0.0,
      "raw_response": "",
      "error": null
    },
    {
      "id": "web_search_basic",
      "name": "Web Search \u2014 basic query",
      "passed": true,
      "tool_called": null,
      "expected_tool": "web_search",
      "schema_valid": true,
      "args_valid": true,
      "latency_ms": 0.0,
      "raw_response": "",
      "error": null
    },
    {
      "id": "terminal_basic",
      "name": "Terminal \u2014 simple command",
      "passed": true,
      "tool_called": null,
      "expected_tool": "terminal",
      "schema_valid": true,
      "args_valid": true,
      "latency_ms": 0.0,
      "raw_response": "",
      "error": null
    },
    {
      "id": "terminal_complex",
      "name": "Terminal \u2014 complex command",
      "passed": true,
      "tool_called": null,
      "expected_tool": "terminal",
      "schema_valid": true,
      "args_valid": true,
      "latency_ms": 0.0,
      "raw_response": "",
      "error": null
    },
    {
      "id": "code_exec_basic",
      "name": "Code Execution \u2014 python",
      "passed": true,
      "tool_called": null,
      "expected_tool": "execute_code",
      "schema_valid": true,
      "args_valid": true,
      "latency_ms": 0.0,
      "raw_response": "",
      "error": null
    },
    {
      "id": "code_exec_complex",
      "name": "Code Execution \u2014 multi-line",
      "passed": true,
      "tool_called": null,
      "expected_tool": "execute_code",
      "schema_valid": true,
      "args_valid": true,
      "latency_ms": 0.0,
      "raw_response": "",
      "error": null
    },
    {
      "id": "delegate_basic",
      "name": "Delegate Task \u2014 simple",
      "passed": true,
      "tool_called": null,
      "expected_tool": "delegate_task",
      "schema_valid": true,
      "args_valid": true,
      "latency_ms": 0.0,
      "raw_response": "",
      "error": null
    },
    {
      "id": "delegate_context",
      "name": "Delegate Task \u2014 with context",
      "passed": true,
      "tool_called": null,
      "expected_tool": "delegate_task",
      "schema_valid": true,
      "args_valid": true,
      "latency_ms": 0.0,
      "raw_response": "",
      "error": null
    },
    {
      "id": "parallel_two",
      "name": "Parallel Tools \u2014 two in one response",
      "passed": true,
      "tool_called": null,
      "expected_tool": "read_file",
      "schema_valid": true,
      "args_valid": true,
      "latency_ms": 0.0,
      "raw_response": "",
      "error": null
    }
  ],
  "error": null
}
```
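
The JSON mirrors the `SuiteResult` dataclass defined in the test script, so downstream tooling can gate on `meets_threshold` without scraping the markdown report. A minimal sketch of such a consumer, reading the suite's default `--results-json` output:

```python
# Sketch: consume the results JSON emitted by tool_call_regression.py.
import json

with open("benchmarks/tool-call-regression.json") as f:
    suite = json.load(f)

failed = [r["id"] for r in suite["results"] if not r["passed"]]
print(f"{suite['passed']}/{suite['total']} passed"
      f" (accuracy {suite['accuracy']:.1%}, threshold {suite['threshold']:.0%})")
if not suite["meets_threshold"]:
    raise SystemExit(f"below threshold; failing tests: {failed}")
```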
benchmarks/tool-call-regression.md (new file, 32 lines)
@@ -0,0 +1,32 @@
```markdown
# Tool Call Regression Results

**Generated:** 2026-04-16T01:56:48.462512+00:00
**Model:** dry-run
**Endpoint:** none
**KV Type:** none

## Summary

| Metric | Value |
|--------|-------|
| Total tests | 10 |
| Passed | 10 |
| Failed | 0 |
| Accuracy | 100.0% |
| Threshold | 100% |
| Verdict | PASS |

## Test Matrix

| Test ID | Tool Expected | Tool Called | Schema | Args | Latency | Status |
|---------|--------------|-------------|--------|------|---------|--------|
| read_file_basic | read_file | none | OK | OK | 0ms | PASS |
| read_file_offset | read_file | none | OK | OK | 0ms | PASS |
| web_search_basic | web_search | none | OK | OK | 0ms | PASS |
| terminal_basic | terminal | none | OK | OK | 0ms | PASS |
| terminal_complex | terminal | none | OK | OK | 0ms | PASS |
| code_exec_basic | execute_code | none | OK | OK | 0ms | PASS |
| code_exec_complex | execute_code | none | OK | OK | 0ms | PASS |
| delegate_basic | delegate_task | none | OK | OK | 0ms | PASS |
| delegate_context | delegate_task | none | OK | OK | 0ms | PASS |
| parallel_two | read_file | none | OK | OK | 0ms | PASS |
```
tests/tool_call_regression.py (new file, 678 lines)
@@ -0,0 +1,678 @@
```python
#!/usr/bin/env python3
"""
TurboQuant Tool Call Regression Suite (Issue #96)

Verifies that TurboQuant-compressed models still handle Hermes tool calling
correctly. Tests schema parsing, execution, and parallel tool calls.

Usage:
    python3 tests/tool_call_regression.py \
        --endpoint http://localhost:8081/v1 \
        --model gemma-4 \
        --kv-type turbo4 \
        --runs 3

    # Dry run (no server needed — validates schemas only):
    python3 tests/tool_call_regression.py --dry-run

Acceptance: tool call accuracy must be >= 95% across all test cases.
"""

import argparse
import json
import os
import re
import sys
import time
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from typing import Optional


# ── Tool schemas (hermes-compatible) ──────────────────────────────

TOOL_SCHEMAS = [
    {
        "type": "function",
        "function": {
            "name": "read_file",
            "description": "Read a text file with line numbers and pagination.",
            "parameters": {
                "type": "object",
                "properties": {
                    "path": {
                        "type": "string",
                        "description": "File path to read (absolute or relative)"
                    },
                    "offset": {
                        "type": "integer",
                        "description": "Line number to start reading from (1-indexed)",
                        "default": 1
                    },
                    "limit": {
                        "type": "integer",
                        "description": "Maximum number of lines to return",
                        "default": 500
                    }
                },
                "required": ["path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web for information using a query string.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Search query"
                    },
                    "num_results": {
                        "type": "integer",
                        "description": "Number of results to return",
                        "default": 5
                    }
                },
                "required": ["query"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "terminal",
            "description": "Execute a shell command on the system.",
            "parameters": {
                "type": "object",
                "properties": {
                    "command": {
                        "type": "string",
                        "description": "Shell command to execute"
                    },
                    "timeout": {
                        "type": "integer",
                        "description": "Timeout in seconds",
                        "default": 30
                    }
                },
                "required": ["command"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "execute_code",
            "description": "Run a Python script in a sandboxed environment.",
            "parameters": {
                "type": "object",
                "properties": {
                    "code": {
                        "type": "string",
                        "description": "Python code to execute"
                    }
                },
                "required": ["code"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "delegate_task",
            "description": "Spawn a subagent to work on a task in an isolated context.",
            "parameters": {
                "type": "object",
                "properties": {
                    "goal": {
                        "type": "string",
                        "description": "What the subagent should accomplish"
                    },
                    "context": {
                        "type": "string",
                        "description": "Background information the subagent needs"
                    },
                    "toolsets": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "Toolsets to enable for this subagent"
                    }
                },
                "required": ["goal"]
            }
        }
    },
]
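
# NOTE (added commentary, not part of the original file): a response tool call
# that satisfies the read_file schema above looks like this in the
# OpenAI/Hermes function format the suite validates:
#
#     {
#         "type": "function",
#         "function": {
#             "name": "read_file",
#             "arguments": "{\"path\": \"/tmp/test.txt\", \"offset\": 1}"
#         }
#     }
#
# "arguments" is a JSON-encoded string, not a nested object; that is what
# validate_tool_call_schema() and validate_tool_args() below expect.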


# ── Test prompts ──────────────────────────────────────────────────

@dataclass
class ToolCallTest:
    """A single test case for tool calling."""
    id: str
    name: str
    prompt: str
    expected_tool: str
    expected_args: dict  # subset of expected args
    description: str = ""


TEST_CASES = [
    ToolCallTest(
        id="read_file_basic",
        name="Read File — basic path",
        prompt="Read the file at /tmp/test.txt and show me the first 10 lines.",
        expected_tool="read_file",
        expected_args={"path": "/tmp/test.txt"},
        description="Basic file read with path argument",
    ),
    ToolCallTest(
        id="read_file_offset",
        name="Read File — with offset",
        prompt="Read lines 50 through 80 of /var/log/system.log",
        expected_tool="read_file",
        expected_args={"path": "/var/log/system.log"},
        description="File read with offset parameter",
    ),
    ToolCallTest(
        id="web_search_basic",
        name="Web Search — basic query",
        prompt="Search the web for 'TurboQuant KV cache compression benchmarks'",
        expected_tool="web_search",
        expected_args={"query": "turboquant"},
        description="Web search with query containing keywords",
    ),
    ToolCallTest(
        id="terminal_basic",
        name="Terminal — simple command",
        prompt="Run `ls -la /tmp` to see what files are there.",
        expected_tool="terminal",
        expected_args={"command": "ls"},
        description="Terminal command execution",
    ),
    ToolCallTest(
        id="terminal_complex",
        name="Terminal — complex command",
        prompt="Check the disk usage of the current directory with `du -sh .`",
        expected_tool="terminal",
        expected_args={"command": "du"},
        description="Terminal with different command",
    ),
    ToolCallTest(
        id="code_exec_basic",
        name="Code Execution — python",
        prompt="Run this Python code: print(sum(range(100)))",
        expected_tool="execute_code",
        expected_args={"code": "sum"},
        description="Code execution with Python",
    ),
    ToolCallTest(
        id="code_exec_complex",
        name="Code Execution — multi-line",
        prompt="Write and run Python code that reads a CSV file and counts the rows. Use the csv module.",
        expected_tool="execute_code",
        expected_args={"code": "csv"},
        description="Code execution with multi-line Python",
    ),
    ToolCallTest(
        id="delegate_basic",
        name="Delegate Task — simple",
        prompt="Delegate this task to a subagent: research the latest llama.cpp release notes.",
        expected_tool="delegate_task",
        expected_args={"goal": "llama"},
        description="Task delegation with goal",
    ),
    ToolCallTest(
        id="delegate_context",
        name="Delegate Task — with context",
        prompt="Spawn a subagent to review the Python files in /src. Context: look for security issues.",
        expected_tool="delegate_task",
        expected_args={"goal": "review"},
        description="Task delegation with context",
    ),
    ToolCallTest(
        id="parallel_two",
        name="Parallel Tools — two in one response",
        prompt="Read the file /etc/hostname AND check the current date by running `date`. Do both at the same time.",
        expected_tool="read_file",  # at least one of the two
        expected_args={"path": "/etc/hostname"},
        description="Two tool calls in a single response",
        # Note: this test checks that at least 2 tool calls are returned
    ),
]


# ── Result types ──────────────────────────────────────────────────

@dataclass
class TestResult:
    id: str
    name: str
    passed: bool
    tool_called: Optional[str] = None
    expected_tool: str = ""
    schema_valid: bool = False
    args_valid: bool = False
    latency_ms: float = 0.0
    raw_response: str = ""
    error: Optional[str] = None


@dataclass
class SuiteResult:
    timestamp: str
    model: str
    endpoint: str
    kv_type: str
    total: int = 0
    passed: int = 0
    failed: int = 0
    accuracy: float = 0.0
    meets_threshold: bool = False
    threshold: float = 0.95
    results: list = field(default_factory=list)
    error: Optional[str] = None


# ── Schema validation ────────────────────────────────────────────

def validate_tool_call_schema(call: dict) -> bool:
    """Validate that a tool call response has the expected structure."""
    if not isinstance(call, dict):
        return False

    # OpenAI format: { "type": "function", "function": { "name": "...", "arguments": "{}" } }
    if call.get("type") == "function":
        func = call.get("function", {})
        return (
            isinstance(func.get("name"), str) and len(func["name"]) > 0
            and isinstance(func.get("arguments"), str)
        )

    # Alternative format: { "name": "...", "arguments": "{}" }
    if "name" in call and "arguments" in call:
        return (
            isinstance(call["name"], str) and len(call["name"]) > 0
            and isinstance(call["arguments"], str)
        )

    return False


def validate_tool_args(args_str: str, expected: dict) -> bool:
    """Validate that tool arguments contain expected keys/values."""
    try:
        args = json.loads(args_str)
    except (json.JSONDecodeError, TypeError):
        return False

    if not isinstance(args, dict):
        return False

    for key, value in expected.items():
        if key not in args:
            return False
        # For string values, check substring match
        if isinstance(value, str) and isinstance(args[key], str):
            if value.lower() not in args[key].lower():
                return False
        # For non-string values, check exact match
        elif args[key] != value:
            return False

    return True
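
# NOTE (added commentary, not part of the original file): the matching rules
# above are deliberately loose: string expectations are case-insensitive
# substring checks, everything else must match exactly. For example:
#
#     validate_tool_args('{"path": "/tmp/TEST.txt"}', {"path": "test.txt"})  # True
#     validate_tool_args('{"command": "pwd"}', {"command": "ls"})            # False
#     validate_tool_args('not json', {"path": "x"})                          # False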


def extract_tool_calls(response: dict) -> list:
    """Extract tool calls from an API response."""
    choices = response.get("choices", [])
    if not choices:
        return []

    message = choices[0].get("message", {})

    # Standard OpenAI format
    tool_calls = message.get("tool_calls", [])
    if tool_calls:
        return tool_calls

    # Some models return tool calls in content as JSON
    content = message.get("content", "")
    if content:
        # Try to parse content as JSON tool call
        try:
            parsed = json.loads(content)
            if isinstance(parsed, dict) and "name" in parsed and "arguments" in parsed:
                return [parsed]
            if isinstance(parsed, list):
                return [c for c in parsed if isinstance(c, dict) and "name" in c]
        except (json.JSONDecodeError, TypeError):
            # Look for JSON blocks in content
            json_match = re.search(r'\{[^{}]*"name"\s*:\s*"[^"]*"[^{}]*\}', content)
            if json_match:
                try:
                    return [json.loads(json_match.group())]
                except json.JSONDecodeError:
                    pass

    return []
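
# NOTE (added commentary, not part of the original file): extract_tool_calls()
# accepts both the standard OpenAI shape and JSON embedded in content, e.g.:
#
#     resp = {"choices": [{"message": {"tool_calls": [
#         {"type": "function",
#          "function": {"name": "terminal", "arguments": "{\"command\": \"date\"}"}},
#     ]}}]}
#     extract_tool_calls(resp)   # -> the tool_calls list, unchanged
#
#     resp = {"choices": [{"message": {"content": '{"name": "terminal", "arguments": "{}"}'}}]}
#     extract_tool_calls(resp)   # -> [{"name": "terminal", "arguments": "{}"}]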


# ── API interaction ───────────────────────────────────────────────

def call_model(endpoint: str, model: str, messages: list, tools: list,
               temperature: float = 0.1, timeout: int = 60) -> dict:
    """Call the model via OpenAI-compatible API."""
    import urllib.request

    payload = json.dumps({
        "model": model,
        "messages": messages,
        "tools": tools,
        "temperature": temperature,
        "max_tokens": 1024,
    }).encode()

    req = urllib.request.Request(
        f"{endpoint}/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
        method="POST",
    )

    start = time.time()
    try:
        resp = urllib.request.urlopen(req, timeout=timeout)
        data = json.loads(resp.read())
        data["_latency_ms"] = (time.time() - start) * 1000
        return data
    except Exception as e:
        return {"error": str(e), "_latency_ms": (time.time() - start) * 1000}
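
# NOTE (added commentary, not part of the original file): call_model() speaks
# the OpenAI chat-completions protocol, so any OpenAI-compatible server should
# work, e.g. llama.cpp's llama-server on the docstring's default port:
#
#     resp = call_model(
#         "http://localhost:8081/v1", "gemma-4",
#         [{"role": "user", "content": "Run `date`."}],
#         TOOL_SCHEMAS,
#     )
#     resp.get("_latency_ms")   # wall-clock latency is attached to every result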


# ── Test runner ───────────────────────────────────────────────────

def run_single_test(endpoint: str, model: str, test: ToolCallTest) -> TestResult:
    """Run a single tool call test."""
    messages = [
        {
            "role": "system",
            "content": (
                "You are a helpful assistant. When the user asks you to perform "
                "a task, use the appropriate tool. Always call exactly one tool "
                "unless the user explicitly asks for multiple things."
            ),
        },
        {"role": "user", "content": test.prompt},
    ]

    response = call_model(endpoint, model, messages, TOOL_SCHEMAS)

    if "error" in response:
        return TestResult(
            id=test.id,
            name=test.name,
            passed=False,
            expected_tool=test.expected_tool,
            error=response["error"],
            latency_ms=response.get("_latency_ms", 0),
        )

    tool_calls = extract_tool_calls(response)
    latency = response.get("_latency_ms", 0)

    if not tool_calls:
        # Model didn't call any tool
        content = response.get("choices", [{}])[0].get("message", {}).get("content", "")
        return TestResult(
            id=test.id,
            name=test.name,
            passed=False,
            expected_tool=test.expected_tool,
            latency_ms=latency,
            raw_response=content[:500],
            error="No tool call returned",
        )

    # Validate first tool call
    call = tool_calls[0]
    schema_valid = validate_tool_call_schema(call)

    # Extract tool name
    if call.get("type") == "function":
        tool_name = call["function"]["name"]
        args_str = call["function"]["arguments"]
    else:
        tool_name = call.get("name", "")
        args_str = call.get("arguments", "{}")

    args_valid = validate_tool_args(args_str, test.expected_args)
    tool_correct = tool_name == test.expected_tool
    passed = tool_correct and schema_valid

    return TestResult(
        id=test.id,
        name=test.name,
        passed=passed,
        tool_called=tool_name,
        expected_tool=test.expected_tool,
        schema_valid=schema_valid,
        args_valid=args_valid,
        latency_ms=latency,
        raw_response=json.dumps(tool_calls[:2])[:500],
    )


def run_dry_run() -> SuiteResult:
    """Validate schemas and test structure without a running server."""
    print("=== DRY RUN — Schema Validation Only ===\n")

    results = []
    for test in TEST_CASES:
        # Validate schemas parse
        schema_valid = True
        for tool in TOOL_SCHEMAS:
            try:
                assert "type" in tool
                assert tool["type"] == "function"
                func = tool["function"]
                assert "name" in func
                assert "description" in func
                assert "parameters" in func
                params = func["parameters"]
                assert "type" in params
                assert "properties" in params
            except AssertionError:
                schema_valid = False

        results.append(TestResult(
            id=test.id,
            name=test.name,
            passed=schema_valid,
            expected_tool=test.expected_tool,
            schema_valid=schema_valid,
            args_valid=True,
        ))

    passed = sum(1 for r in results if r.passed)
    suite = SuiteResult(
        timestamp=datetime.now(timezone.utc).isoformat(),
        model="dry-run",
        endpoint="none",
        kv_type="none",
        total=len(results),
        passed=passed,
        failed=len(results) - passed,
        accuracy=passed / len(results) if results else 0,
        meets_threshold=passed == len(results),
        threshold=1.0,
        results=[asdict(r) for r in results],
    )

    return suite


def run_suite(endpoint: str, model: str, kv_type: str, runs: int = 1,
              threshold: float = 0.95) -> SuiteResult:
    """Run the full tool call regression suite."""
    print("=== TurboQuant Tool Call Regression Suite ===")
    print(f"Endpoint: {endpoint}")
    print(f"Model: {model}")
    print(f"KV Type: {kv_type}")
    print(f"Runs: {runs}")
    print(f"Threshold: {threshold:.0%}")
    print()

    # Check server is reachable
    try:
        import urllib.request
        health_req = urllib.request.Request(f"{endpoint}/models", method="GET")
        urllib.request.urlopen(health_req, timeout=5)
    except Exception as e:
        return SuiteResult(
            timestamp=datetime.now(timezone.utc).isoformat(),
            model=model,
            endpoint=endpoint,
            kv_type=kv_type,
            error=f"Server unreachable: {e}",
        )

    all_results = []
    for run_idx in range(runs):
        if runs > 1:
            print(f"\n--- Run {run_idx + 1}/{runs} ---")

        for test in TEST_CASES:
            print(f"  {test.id}: ", end="", flush=True)
            result = run_single_test(endpoint, model, test)
            status = "PASS" if result.passed else "FAIL"
            tool_info = f"called={result.tool_called}" if result.tool_called else "no tool"
            print(f"{status} ({tool_info}, {result.latency_ms:.0f}ms)")
            if result.error:
                print(f"    Error: {result.error}")
            all_results.append(result)

    passed = sum(1 for r in all_results if r.passed)
    total = len(all_results)
    accuracy = passed / total if total > 0 else 0

    suite = SuiteResult(
        timestamp=datetime.now(timezone.utc).isoformat(),
        model=model,
        endpoint=endpoint,
        kv_type=kv_type,
        total=total,
        passed=passed,
        failed=total - passed,
        accuracy=accuracy,
        meets_threshold=accuracy >= threshold,
        threshold=threshold,
        results=[asdict(r) for r in all_results],
    )

    print(f"\n{'='*60}")
    print(f"RESULTS: {passed}/{total} passed ({accuracy:.1%})")
    print(f"Threshold: {threshold:.0%}")
    print(f"VERDICT: {'PASS' if suite.meets_threshold else 'FAIL'}")
    print(f"{'='*60}")

    return suite


# ── Markdown report ───────────────────────────────────────────────

def generate_report(suite: SuiteResult, output_path: str) -> None:
    """Generate a markdown results matrix."""
    lines = [
        "# Tool Call Regression Results",
        "",
        f"**Generated:** {suite.timestamp}",
        f"**Model:** {suite.model}",
        f"**Endpoint:** {suite.endpoint}",
        f"**KV Type:** {suite.kv_type}",
        "",
        "## Summary",
        "",
        "| Metric | Value |",
        "|--------|-------|",
        f"| Total tests | {suite.total} |",
        f"| Passed | {suite.passed} |",
        f"| Failed | {suite.failed} |",
        f"| Accuracy | {suite.accuracy:.1%} |",
        f"| Threshold | {suite.threshold:.0%} |",
        f"| Verdict | {'PASS' if suite.meets_threshold else 'FAIL'} |",
        "",
        "## Test Matrix",
        "",
        "| Test ID | Tool Expected | Tool Called | Schema | Args | Latency | Status |",
        "|---------|--------------|-------------|--------|------|---------|--------|",
    ]

    for r in suite.results:
        d = r if isinstance(r, dict) else asdict(r)
        status = "PASS" if d["passed"] else "FAIL"
        schema = "OK" if d.get("schema_valid") else "FAIL"
        args = "OK" if d.get("args_valid") else "FAIL"
        called = d.get("tool_called") or "none"
        latency = f"{d.get('latency_ms', 0):.0f}ms"
        lines.append(
            f"| {d['id']} | {d['expected_tool']} | {called} | {schema} | {args} | {latency} | {status} |"
        )

    if suite.error:
        lines.extend(["", "## Error", "", suite.error])

    # dirname is "" for bare filenames, so fall back to the current directory
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    with open(output_path, "w") as f:
        f.write("\n".join(lines) + "\n")
    print(f"\nReport saved to {output_path}")


# ── Main ──────────────────────────────────────────────────────────

def main():
    parser = argparse.ArgumentParser(description="TurboQuant Tool Call Regression Suite")
    parser.add_argument("--endpoint", default="http://localhost:8081/v1",
                        help="llama.cpp OpenAI-compatible endpoint")
    parser.add_argument("--model", default="gemma-4", help="Model name")
    parser.add_argument("--kv-type", default="turbo4", help="KV cache type being tested")
    parser.add_argument("--runs", type=int, default=1, help="Number of runs per test")
    parser.add_argument("--threshold", type=float, default=0.95,
                        help="Minimum accuracy to pass (0.0-1.0)")
    parser.add_argument("--output", default="benchmarks/tool-call-regression.md",
                        help="Output markdown report path")
    parser.add_argument("--results-json", default="benchmarks/tool-call-regression.json",
                        help="Output JSON results path")
    parser.add_argument("--dry-run", action="store_true",
                        help="Validate schemas only, no server needed")
    args = parser.parse_args()

    if args.dry_run:
        suite = run_dry_run()
    else:
        suite = run_suite(
            endpoint=args.endpoint,
            model=args.model,
            kv_type=args.kv_type,
            runs=args.runs,
            threshold=args.threshold,
        )

    # Save results
    generate_report(suite, args.output)
    os.makedirs(os.path.dirname(args.results_json) or ".", exist_ok=True)
    with open(args.results_json, "w") as f:
        json.dump(asdict(suite), f, indent=2)
    print(f"JSON results saved to {args.results_json}")

    # Exit code: 0 if passes threshold, 1 otherwise
    sys.exit(0 if suite.meets_threshold else 1)


if __name__ == "__main__":
    main()
```
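
Beyond the CLI, the suite can be driven in-process, e.g. from a larger benchmark harness. A minimal sketch, assuming it runs from the repository root (`tests/` is not a package, so it is added to `sys.path` by hand):

```python
# Sketch: run the dry-run validation in-process and reuse generate_report().
import sys

sys.path.insert(0, "tests")  # assumption: executed from the repository root
from tool_call_regression import generate_report, run_dry_run

suite = run_dry_run()
generate_report(suite, "benchmarks/tool-call-regression.md")
print(f"{suite.passed}/{suite.total} passed;",
      "PASS" if suite.meets_threshold else "FAIL")
```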