turboquant/tests/tool_call_regression.py
test: tool call regression suite for compressed models (closes #96)
tests/tool_call_regression.py:
- 10 test cases covering 5 Hermes tools: read_file, web_search, terminal,
  execute_code, delegate_task
- Schema validation (OpenAI-compatible tool call format)
- Argument validation (correct tool + expected args)
- Parallel tool calling test (multiple tools in one response)
- Dry-run mode for CI (schema validation without server)
- Full server mode with latency tracking
- Markdown report generation with results matrix
- JSON results output for programmatic consumption
- 95% accuracy threshold gate (exit code 1 on failure)

benchmarks/tool-call-regression.md:
- Results template with model/preset matrix
- Tool coverage tracking table

.gitea/workflows/smoke.yml:
- Added dry-run tool call schema validation step
2026-04-15 21:58:34 -04:00


#!/usr/bin/env python3
"""
TurboQuant Tool Call Regression Suite (Issue #96)
Verifies that TurboQuant-compressed models still handle Hermes tool calling
correctly. Tests schema parsing, execution, and parallel tool calls.
Usage:
python3 tests/tool_call_regression.py \
--endpoint http://localhost:8081/v1 \
--model gemma-4 \
--kv-type turbo4 \
--runs 3
# Dry run (no server needed — validates schemas only):
python3 tests/tool_call_regression.py --dry-run
Acceptance: tool call accuracy must be >= 95% across all test cases.
"""
import argparse
import json
import os
import re
import sys
import time
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from typing import Optional
# ── Tool schemas (Hermes-compatible) ──────────────────────────────
TOOL_SCHEMAS = [
{
"type": "function",
"function": {
"name": "read_file",
"description": "Read a text file with line numbers and pagination.",
"parameters": {
"type": "object",
"properties": {
"path": {
"type": "string",
"description": "File path to read (absolute or relative)"
},
"offset": {
"type": "integer",
"description": "Line number to start reading from (1-indexed)",
"default": 1
},
"limit": {
"type": "integer",
"description": "Maximum number of lines to return",
"default": 500
}
},
"required": ["path"]
}
}
},
{
"type": "function",
"function": {
"name": "web_search",
"description": "Search the web for information using a query string.",
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "Search query"
},
"num_results": {
"type": "integer",
"description": "Number of results to return",
"default": 5
}
},
"required": ["query"]
}
}
},
{
"type": "function",
"function": {
"name": "terminal",
"description": "Execute a shell command on the system.",
"parameters": {
"type": "object",
"properties": {
"command": {
"type": "string",
"description": "Shell command to execute"
},
"timeout": {
"type": "integer",
"description": "Timeout in seconds",
"default": 30
}
},
"required": ["command"]
}
}
},
{
"type": "function",
"function": {
"name": "execute_code",
"description": "Run a Python script in a sandboxed environment.",
"parameters": {
"type": "object",
"properties": {
"code": {
"type": "string",
"description": "Python code to execute"
}
},
"required": ["code"]
}
}
},
{
"type": "function",
"function": {
"name": "delegate_task",
"description": "Spawn a subagent to work on a task in an isolated context.",
"parameters": {
"type": "object",
"properties": {
"goal": {
"type": "string",
"description": "What the subagent should accomplish"
},
"context": {
"type": "string",
"description": "Background information the subagent needs"
},
"toolsets": {
"type": "array",
"items": {"type": "string"},
"description": "Toolsets to enable for this subagent"
}
},
"required": ["goal"]
}
}
},
]
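# These schemas are sent verbatim as the "tools" field of every
# chat/completions request (see call_model below).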
# ── Test prompts ──────────────────────────────────────────────────
@dataclass
class ToolCallTest:
"""A single test case for tool calling."""
id: str
name: str
prompt: str
expected_tool: str
    expected_args: dict  # subset of expected args; string values match as substrings
    description: str = ""
    min_calls: int = 1  # minimum number of tool calls the response must contain
TEST_CASES = [
ToolCallTest(
id="read_file_basic",
name="Read File — basic path",
prompt="Read the file at /tmp/test.txt and show me the first 10 lines.",
expected_tool="read_file",
expected_args={"path": "/tmp/test.txt"},
description="Basic file read with path argument",
),
ToolCallTest(
id="read_file_offset",
name="Read File — with offset",
prompt="Read lines 50 through 80 of /var/log/system.log",
expected_tool="read_file",
expected_args={"path": "/var/log/system.log"},
description="File read with offset parameter",
),
ToolCallTest(
id="web_search_basic",
name="Web Search — basic query",
prompt="Search the web for 'TurboQuant KV cache compression benchmarks'",
expected_tool="web_search",
expected_args={"query": "turboquant"},
description="Web search with query containing keywords",
),
ToolCallTest(
id="terminal_basic",
name="Terminal — simple command",
prompt="Run `ls -la /tmp` to see what files are there.",
expected_tool="terminal",
expected_args={"command": "ls"},
description="Terminal command execution",
),
ToolCallTest(
id="terminal_complex",
name="Terminal — complex command",
prompt="Check the disk usage of the current directory with `du -sh .`",
expected_tool="terminal",
expected_args={"command": "du"},
description="Terminal with different command",
),
ToolCallTest(
id="code_exec_basic",
name="Code Execution — python",
prompt="Run this Python code: print(sum(range(100)))",
expected_tool="execute_code",
expected_args={"code": "sum"},
description="Code execution with Python",
),
ToolCallTest(
id="code_exec_complex",
name="Code Execution — multi-line",
prompt="Write and run Python code that reads a CSV file and counts the rows. Use the csv module.",
expected_tool="execute_code",
expected_args={"code": "csv"},
description="Code execution with multi-line Python",
),
ToolCallTest(
id="delegate_basic",
name="Delegate Task — simple",
prompt="Delegate this task to a subagent: research the latest llama.cpp release notes.",
expected_tool="delegate_task",
expected_args={"goal": "llama"},
description="Task delegation with goal",
),
ToolCallTest(
id="delegate_context",
name="Delegate Task — with context",
prompt="Spawn a subagent to review the Python files in /src. Context: look for security issues.",
expected_tool="delegate_task",
expected_args={"goal": "review"},
description="Task delegation with context",
),
ToolCallTest(
id="parallel_two",
name="Parallel Tools — two in one response",
prompt="Read the file /etc/hostname AND check the current date by running `date`. Do both at the same time.",
expected_tool="read_file", # at least one of the two
expected_args={"path": "/etc/hostname"},
description="Two tool calls in a single response",
# Note: this test checks that at least 2 tool calls are returned
),
]
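# expected_args values are matched loosely by validate_tool_args below: string
# values only need to appear as case-insensitive substrings of the model's
# arguments, so e.g. {"command": "ls"} accepts "ls -la /tmp".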
# ── Result types ──────────────────────────────────────────────────
@dataclass
class TestResult:
id: str
name: str
passed: bool
tool_called: Optional[str] = None
expected_tool: str = ""
schema_valid: bool = False
args_valid: bool = False
latency_ms: float = 0.0
raw_response: str = ""
error: Optional[str] = None
@dataclass
class SuiteResult:
timestamp: str
model: str
endpoint: str
kv_type: str
total: int = 0
passed: int = 0
failed: int = 0
accuracy: float = 0.0
meets_threshold: bool = False
threshold: float = 0.95
results: list = field(default_factory=list)
error: Optional[str] = None
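# SuiteResult is what main() serializes into --results-json (via asdict), so CI
# or dashboards can gate on it directly. A minimal consumption sketch, assuming
# the default output path:
#
#     import json
#     with open("benchmarks/tool-call-regression.json") as f:
#         suite = json.load(f)
#     assert suite["meets_threshold"], f"accuracy {suite['accuracy']:.1%}"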
# ── Schema validation ────────────────────────────────────────────
def validate_tool_call_schema(call: dict) -> bool:
"""Validate that a tool call response has the expected structure."""
if not isinstance(call, dict):
return False
# OpenAI format: { "type": "function", "function": { "name": "...", "arguments": "{}" } }
if call.get("type") == "function":
func = call.get("function", {})
return (
isinstance(func.get("name"), str) and len(func["name"]) > 0
and isinstance(func.get("arguments"), str)
)
# Alternative format: { "name": "...", "arguments": "{}" }
if "name" in call and "arguments" in call:
return (
isinstance(call["name"], str) and len(call["name"]) > 0
and isinstance(call["arguments"], str)
)
return False
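# Illustrative inputs (hypothetical values) for the two accepted shapes:
#     validate_tool_call_schema({"type": "function",
#                                "function": {"name": "terminal",
#                                             "arguments": "{}"}})        # True
#     validate_tool_call_schema({"name": "terminal", "arguments": "{}"})  # True
#     validate_tool_call_schema({"name": "terminal", "arguments": {}})    # False: arguments must be a JSON string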
def validate_tool_args(args_str: str, expected: dict) -> bool:
"""Validate that tool arguments contain expected keys/values."""
try:
args = json.loads(args_str)
except (json.JSONDecodeError, TypeError):
return False
if not isinstance(args, dict):
return False
for key, value in expected.items():
if key not in args:
return False
# For string values, check substring match
if isinstance(value, str) and isinstance(args[key], str):
if value.lower() not in args[key].lower():
return False
# For non-string values, check exact match
elif args[key] != value:
return False
return True
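# Matching semantics: string expectations are case-insensitive substring checks;
# everything else must match exactly. Illustrative (hypothetical) values:
#     validate_tool_args('{"command": "ls -la /tmp"}', {"command": "ls"})  # True  (substring)
#     validate_tool_args('{"command": "pwd"}', {"command": "ls"})          # False (no substring)
#     validate_tool_args('{"timeout": 30}', {"timeout": 60})               # False (exact mismatch)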
def extract_tool_calls(response: dict) -> list:
"""Extract tool calls from an API response."""
choices = response.get("choices", [])
if not choices:
return []
message = choices[0].get("message", {})
# Standard OpenAI format
tool_calls = message.get("tool_calls", [])
if tool_calls:
return tool_calls
# Some models return tool calls in content as JSON
content = message.get("content", "")
if content:
# Try to parse content as JSON tool call
try:
parsed = json.loads(content)
if isinstance(parsed, dict) and "name" in parsed and "arguments" in parsed:
return [parsed]
if isinstance(parsed, list):
return [c for c in parsed if isinstance(c, dict) and "name" in c]
except (json.JSONDecodeError, TypeError):
# Look for JSON blocks in content
json_match = re.search(r'\{[^{}]*"name"\s*:\s*"[^"]*"[^{}]*\}', content)
if json_match:
try:
return [json.loads(json_match.group())]
except json.JSONDecodeError:
pass
return []
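# Fallback example: a model that emits the call as plain message content instead
# of a tool_calls array is still recovered (hypothetical response shape):
#     resp = {"choices": [{"message": {"content": '{"name": "web_search", "arguments": "{}"}'}}]}
#     extract_tool_calls(resp)  # -> [{"name": "web_search", "arguments": "{}"}]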
# ── API interaction ───────────────────────────────────────────────
def call_model(endpoint: str, model: str, messages: list, tools: list,
temperature: float = 0.1, timeout: int = 60) -> dict:
"""Call the model via OpenAI-compatible API."""
import urllib.request
payload = json.dumps({
"model": model,
"messages": messages,
"tools": tools,
"temperature": temperature,
"max_tokens": 1024,
}).encode()
req = urllib.request.Request(
f"{endpoint}/chat/completions",
data=payload,
headers={"Content-Type": "application/json"},
method="POST",
)
start = time.time()
try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            data = json.loads(resp.read())
data["_latency_ms"] = (time.time() - start) * 1000
return data
except Exception as e:
return {"error": str(e), "_latency_ms": (time.time() - start) * 1000}
# ── Test runner ───────────────────────────────────────────────────
def run_single_test(endpoint: str, model: str, test: ToolCallTest) -> TestResult:
"""Run a single tool call test."""
messages = [
{
"role": "system",
"content": (
"You are a helpful assistant. When the user asks you to perform "
"a task, use the appropriate tool. Always call exactly one tool "
"unless the user explicitly asks for multiple things."
),
},
{"role": "user", "content": test.prompt},
]
response = call_model(endpoint, model, messages, TOOL_SCHEMAS)
if "error" in response:
return TestResult(
id=test.id,
name=test.name,
passed=False,
expected_tool=test.expected_tool,
error=response["error"],
latency_ms=response.get("_latency_ms", 0),
)
tool_calls = extract_tool_calls(response)
latency = response.get("_latency_ms", 0)
if not tool_calls:
# Model didn't call any tool
        content = response.get("choices", [{}])[0].get("message", {}).get("content") or ""
return TestResult(
id=test.id,
name=test.name,
passed=False,
expected_tool=test.expected_tool,
latency_ms=latency,
raw_response=content[:500],
error="No tool call returned",
)
    # Parallel tests (min_calls > 1) must return enough calls in one response
    if len(tool_calls) < test.min_calls:
        return TestResult(
            id=test.id,
            name=test.name,
            passed=False,
            expected_tool=test.expected_tool,
            latency_ms=latency,
            raw_response=json.dumps(tool_calls)[:500],
            error=f"Expected at least {test.min_calls} tool calls, got {len(tool_calls)}",
        )
    # Validate the first tool call
    call = tool_calls[0]
schema_valid = validate_tool_call_schema(call)
# Extract tool name
if call.get("type") == "function":
        func = call.get("function", {})
        tool_name = func.get("name", "")
        args_str = func.get("arguments", "{}")
else:
tool_name = call.get("name", "")
args_str = call.get("arguments", "{}")
args_valid = validate_tool_args(args_str, test.expected_args)
tool_correct = tool_name == test.expected_tool
    passed = tool_correct and schema_valid and args_valid
return TestResult(
id=test.id,
name=test.name,
passed=passed,
tool_called=tool_name,
expected_tool=test.expected_tool,
schema_valid=schema_valid,
args_valid=args_valid,
latency_ms=latency,
raw_response=json.dumps(tool_calls[:2])[:500],
)
def run_dry_run() -> SuiteResult:
"""Validate schemas and test structure without a running server."""
print("=== DRY RUN — Schema Validation Only ===\n")
    # Validate all tool schemas once up front; every test case inherits the verdict.
    schema_valid = True
    for tool in TOOL_SCHEMAS:
        try:
            assert tool.get("type") == "function"
            func = tool["function"]
            assert "name" in func
            assert "description" in func
            assert "parameters" in func
            params = func["parameters"]
            assert "type" in params
            assert "properties" in params
        except (AssertionError, KeyError):
            schema_valid = False
    results = []
    for test in TEST_CASES:
results.append(TestResult(
id=test.id,
name=test.name,
passed=schema_valid,
expected_tool=test.expected_tool,
schema_valid=schema_valid,
args_valid=True,
))
passed = sum(1 for r in results if r.passed)
suite = SuiteResult(
timestamp=datetime.now(timezone.utc).isoformat(),
model="dry-run",
endpoint="none",
kv_type="none",
total=len(results),
passed=passed,
failed=len(results) - passed,
accuracy=passed / len(results) if results else 0,
meets_threshold=passed == len(results),
threshold=1.0,
results=[asdict(r) for r in results],
)
return suite
def run_suite(endpoint: str, model: str, kv_type: str, runs: int = 1,
threshold: float = 0.95) -> SuiteResult:
"""Run the full tool call regression suite."""
print(f"=== TurboQuant Tool Call Regression Suite ===")
print(f"Endpoint: {endpoint}")
print(f"Model: {model}")
print(f"KV Type: {kv_type}")
print(f"Runs: {runs}")
print(f"Threshold: {threshold:.0%}")
print()
# Check server is reachable
try:
import urllib.request
health_req = urllib.request.Request(f"{endpoint}/models", method="GET")
urllib.request.urlopen(health_req, timeout=5)
except Exception as e:
return SuiteResult(
timestamp=datetime.now(timezone.utc).isoformat(),
model=model,
endpoint=endpoint,
kv_type=kv_type,
error=f"Server unreachable: {e}",
)
all_results = []
for run_idx in range(runs):
if runs > 1:
print(f"\n--- Run {run_idx + 1}/{runs} ---")
for test in TEST_CASES:
print(f" {test.id}: ", end="", flush=True)
result = run_single_test(endpoint, model, test)
status = "PASS" if result.passed else "FAIL"
tool_info = f"called={result.tool_called}" if result.tool_called else "no tool"
print(f"{status} ({tool_info}, {result.latency_ms:.0f}ms)")
if result.error:
print(f" Error: {result.error}")
all_results.append(result)
passed = sum(1 for r in all_results if r.passed)
total = len(all_results)
accuracy = passed / total if total > 0 else 0
suite = SuiteResult(
timestamp=datetime.now(timezone.utc).isoformat(),
model=model,
endpoint=endpoint,
kv_type=kv_type,
total=total,
passed=passed,
failed=total - passed,
accuracy=accuracy,
meets_threshold=accuracy >= threshold,
threshold=threshold,
results=[asdict(r) for r in all_results],
)
print(f"\n{'='*60}")
print(f"RESULTS: {passed}/{total} passed ({accuracy:.1%})")
print(f"Threshold: {threshold:.0%}")
print(f"VERDICT: {'PASS' if suite.meets_threshold else 'FAIL'}")
print(f"{'='*60}")
return suite
# ── Markdown report ───────────────────────────────────────────────
def generate_report(suite: SuiteResult, output_path: str) -> None:
"""Generate a markdown results matrix."""
lines = [
"# Tool Call Regression Results",
"",
f"**Generated:** {suite.timestamp}",
f"**Model:** {suite.model}",
f"**Endpoint:** {suite.endpoint}",
f"**KV Type:** {suite.kv_type}",
"",
"## Summary",
"",
f"| Metric | Value |",
f"|--------|-------|",
f"| Total tests | {suite.total} |",
f"| Passed | {suite.passed} |",
f"| Failed | {suite.failed} |",
f"| Accuracy | {suite.accuracy:.1%} |",
f"| Threshold | {suite.threshold:.0%} |",
f"| Verdict | {'PASS' if suite.meets_threshold else 'FAIL'} |",
"",
"## Test Matrix",
"",
"| Test ID | Tool Expected | Tool Called | Schema | Args | Latency | Status |",
"|---------|--------------|-------------|--------|------|---------|--------|",
]
for r in suite.results:
d = r if isinstance(r, dict) else asdict(r)
status = "PASS" if d["passed"] else "FAIL"
schema = "OK" if d.get("schema_valid") else "FAIL"
args = "OK" if d.get("args_valid") else "FAIL"
called = d.get("tool_called") or "none"
latency = f"{d.get('latency_ms', 0):.0f}ms"
lines.append(
f"| {d['id']} | {d['expected_tool']} | {called} | {schema} | {args} | {latency} | {status} |"
)
if suite.error:
lines.extend(["", "## Error", "", suite.error])
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
with open(output_path, "w") as f:
f.write("\n".join(lines) + "\n")
print(f"\nReport saved to {output_path}")
# ── Main ──────────────────────────────────────────────────────────
def main():
parser = argparse.ArgumentParser(description="TurboQuant Tool Call Regression Suite")
parser.add_argument("--endpoint", default="http://localhost:8081/v1",
help="llama.cpp OpenAI-compatible endpoint")
parser.add_argument("--model", default="gemma-4", help="Model name")
parser.add_argument("--kv-type", default="turbo4", help="KV cache type being tested")
parser.add_argument("--runs", type=int, default=1, help="Number of runs per test")
parser.add_argument("--threshold", type=float, default=0.95,
help="Minimum accuracy to pass (0.0-1.0)")
parser.add_argument("--output", default="benchmarks/tool-call-regression.md",
help="Output markdown report path")
parser.add_argument("--results-json", default="benchmarks/tool-call-regression.json",
help="Output JSON results path")
parser.add_argument("--dry-run", action="store_true",
help="Validate schemas only, no server needed")
args = parser.parse_args()
if args.dry_run:
suite = run_dry_run()
else:
suite = run_suite(
endpoint=args.endpoint,
model=args.model,
kv_type=args.kv_type,
runs=args.runs,
threshold=args.threshold,
)
# Save results
generate_report(suite, args.output)
    json_dir = os.path.dirname(args.results_json)
    if json_dir:
        os.makedirs(json_dir, exist_ok=True)
with open(args.results_json, "w") as f:
json.dump(asdict(suite), f, indent=2)
print(f"JSON results saved to {args.results_json}")
# Exit code: 0 if passes threshold, 1 otherwise
sys.exit(0 if suite.meets_threshold else 1)
if __name__ == "__main__":
main()