#!/usr/bin/env python3 """ TurboQuant Tool Call Regression Suite (Issue #96) Verifies that TurboQuant-compressed models still handle hermes tool calling correctly. Tests schema parsing, execution, and parallel tool calls. Usage: python3 tests/tool_call_regression.py \ --endpoint http://localhost:8081/v1 \ --model gemma-4 \ --kv-type turbo4 \ --runs 3 # Dry run (no server needed — validates schemas only): python3 tests/tool_call_regression.py --dry-run Acceptance: tool call accuracy must be >= 95% across all test cases. """ import argparse import json import os import re import sys import time from dataclasses import dataclass, field, asdict from datetime import datetime, timezone from typing import Optional # ── Tool schemas (hermes-compatible) ────────────────────────────── TOOL_SCHEMAS = [ { "type": "function", "function": { "name": "read_file", "description": "Read a text file with line numbers and pagination.", "parameters": { "type": "object", "properties": { "path": { "type": "string", "description": "File path to read (absolute or relative)" }, "offset": { "type": "integer", "description": "Line number to start reading from (1-indexed)", "default": 1 }, "limit": { "type": "integer", "description": "Maximum number of lines to return", "default": 500 } }, "required": ["path"] } } }, { "type": "function", "function": { "name": "web_search", "description": "Search the web for information using a query string.", "parameters": { "type": "object", "properties": { "query": { "type": "string", "description": "Search query" }, "num_results": { "type": "integer", "description": "Number of results to return", "default": 5 } }, "required": ["query"] } } }, { "type": "function", "function": { "name": "terminal", "description": "Execute a shell command on the system.", "parameters": { "type": "object", "properties": { "command": { "type": "string", "description": "Shell command to execute" }, "timeout": { "type": "integer", "description": "Timeout in seconds", "default": 30 } }, "required": ["command"] } } }, { "type": "function", "function": { "name": "execute_code", "description": "Run a Python script in a sandboxed environment.", "parameters": { "type": "object", "properties": { "code": { "type": "string", "description": "Python code to execute" } }, "required": ["code"] } } }, { "type": "function", "function": { "name": "delegate_task", "description": "Spawn a subagent to work on a task in an isolated context.", "parameters": { "type": "object", "properties": { "goal": { "type": "string", "description": "What the subagent should accomplish" }, "context": { "type": "string", "description": "Background information the subagent needs" }, "toolsets": { "type": "array", "items": {"type": "string"}, "description": "Toolsets to enable for this subagent" } }, "required": ["goal"] } } }, ] # ── Test prompts ────────────────────────────────────────────────── @dataclass class ToolCallTest: """A single test case for tool calling.""" id: str name: str prompt: str expected_tool: str expected_args: dict # subset of expected args description: str = "" TEST_CASES = [ ToolCallTest( id="read_file_basic", name="Read File — basic path", prompt="Read the file at /tmp/test.txt and show me the first 10 lines.", expected_tool="read_file", expected_args={"path": "/tmp/test.txt"}, description="Basic file read with path argument", ), ToolCallTest( id="read_file_offset", name="Read File — with offset", prompt="Read lines 50 through 80 of /var/log/system.log", expected_tool="read_file", expected_args={"path": "/var/log/system.log"}, description="File read with offset parameter", ), ToolCallTest( id="web_search_basic", name="Web Search — basic query", prompt="Search the web for 'TurboQuant KV cache compression benchmarks'", expected_tool="web_search", expected_args={"query": "turboquant"}, description="Web search with query containing keywords", ), ToolCallTest( id="terminal_basic", name="Terminal — simple command", prompt="Run `ls -la /tmp` to see what files are there.", expected_tool="terminal", expected_args={"command": "ls"}, description="Terminal command execution", ), ToolCallTest( id="terminal_complex", name="Terminal — complex command", prompt="Check the disk usage of the current directory with `du -sh .`", expected_tool="terminal", expected_args={"command": "du"}, description="Terminal with different command", ), ToolCallTest( id="code_exec_basic", name="Code Execution — python", prompt="Run this Python code: print(sum(range(100)))", expected_tool="execute_code", expected_args={"code": "sum"}, description="Code execution with Python", ), ToolCallTest( id="code_exec_complex", name="Code Execution — multi-line", prompt="Write and run Python code that reads a CSV file and counts the rows. Use the csv module.", expected_tool="execute_code", expected_args={"code": "csv"}, description="Code execution with multi-line Python", ), ToolCallTest( id="delegate_basic", name="Delegate Task — simple", prompt="Delegate this task to a subagent: research the latest llama.cpp release notes.", expected_tool="delegate_task", expected_args={"goal": "llama"}, description="Task delegation with goal", ), ToolCallTest( id="delegate_context", name="Delegate Task — with context", prompt="Spawn a subagent to review the Python files in /src. Context: look for security issues.", expected_tool="delegate_task", expected_args={"goal": "review"}, description="Task delegation with context", ), ToolCallTest( id="parallel_two", name="Parallel Tools — two in one response", prompt="Read the file /etc/hostname AND check the current date by running `date`. Do both at the same time.", expected_tool="read_file", # at least one of the two expected_args={"path": "/etc/hostname"}, description="Two tool calls in a single response", # Note: this test checks that at least 2 tool calls are returned ), ] # ── Result types ────────────────────────────────────────────────── @dataclass class TestResult: id: str name: str passed: bool tool_called: Optional[str] = None expected_tool: str = "" schema_valid: bool = False args_valid: bool = False latency_ms: float = 0.0 raw_response: str = "" error: Optional[str] = None @dataclass class SuiteResult: timestamp: str model: str endpoint: str kv_type: str total: int = 0 passed: int = 0 failed: int = 0 accuracy: float = 0.0 meets_threshold: bool = False threshold: float = 0.95 results: list = field(default_factory=list) error: Optional[str] = None # ── Schema validation ──────────────────────────────────────────── def validate_tool_call_schema(call: dict) -> bool: """Validate that a tool call response has the expected structure.""" if not isinstance(call, dict): return False # OpenAI format: { "type": "function", "function": { "name": "...", "arguments": "{}" } } if call.get("type") == "function": func = call.get("function", {}) return ( isinstance(func.get("name"), str) and len(func["name"]) > 0 and isinstance(func.get("arguments"), str) ) # Alternative format: { "name": "...", "arguments": "{}" } if "name" in call and "arguments" in call: return ( isinstance(call["name"], str) and len(call["name"]) > 0 and isinstance(call["arguments"], str) ) return False def validate_tool_args(args_str: str, expected: dict) -> bool: """Validate that tool arguments contain expected keys/values.""" try: args = json.loads(args_str) except (json.JSONDecodeError, TypeError): return False if not isinstance(args, dict): return False for key, value in expected.items(): if key not in args: return False # For string values, check substring match if isinstance(value, str) and isinstance(args[key], str): if value.lower() not in args[key].lower(): return False # For non-string values, check exact match elif args[key] != value: return False return True def extract_tool_calls(response: dict) -> list: """Extract tool calls from an API response.""" choices = response.get("choices", []) if not choices: return [] message = choices[0].get("message", {}) # Standard OpenAI format tool_calls = message.get("tool_calls", []) if tool_calls: return tool_calls # Some models return tool calls in content as JSON content = message.get("content", "") if content: # Try to parse content as JSON tool call try: parsed = json.loads(content) if isinstance(parsed, dict) and "name" in parsed and "arguments" in parsed: return [parsed] if isinstance(parsed, list): return [c for c in parsed if isinstance(c, dict) and "name" in c] except (json.JSONDecodeError, TypeError): # Look for JSON blocks in content json_match = re.search(r'\{[^{}]*"name"\s*:\s*"[^"]*"[^{}]*\}', content) if json_match: try: return [json.loads(json_match.group())] except json.JSONDecodeError: pass return [] # ── API interaction ─────────────────────────────────────────────── def call_model(endpoint: str, model: str, messages: list, tools: list, temperature: float = 0.1, timeout: int = 60) -> dict: """Call the model via OpenAI-compatible API.""" import urllib.request payload = json.dumps({ "model": model, "messages": messages, "tools": tools, "temperature": temperature, "max_tokens": 1024, }).encode() req = urllib.request.Request( f"{endpoint}/chat/completions", data=payload, headers={"Content-Type": "application/json"}, method="POST", ) start = time.time() try: resp = urllib.request.urlopen(req, timeout=timeout) data = json.loads(resp.read()) data["_latency_ms"] = (time.time() - start) * 1000 return data except Exception as e: return {"error": str(e), "_latency_ms": (time.time() - start) * 1000} # ── Test runner ─────────────────────────────────────────────────── def run_single_test(endpoint: str, model: str, test: ToolCallTest) -> TestResult: """Run a single tool call test.""" messages = [ { "role": "system", "content": ( "You are a helpful assistant. When the user asks you to perform " "a task, use the appropriate tool. Always call exactly one tool " "unless the user explicitly asks for multiple things." ), }, {"role": "user", "content": test.prompt}, ] response = call_model(endpoint, model, messages, TOOL_SCHEMAS) if "error" in response: return TestResult( id=test.id, name=test.name, passed=False, expected_tool=test.expected_tool, error=response["error"], latency_ms=response.get("_latency_ms", 0), ) tool_calls = extract_tool_calls(response) latency = response.get("_latency_ms", 0) if not tool_calls: # Model didn't call any tool content = response.get("choices", [{}])[0].get("message", {}).get("content", "") return TestResult( id=test.id, name=test.name, passed=False, expected_tool=test.expected_tool, latency_ms=latency, raw_response=content[:500], error="No tool call returned", ) # Validate first tool call call = tool_calls[0] schema_valid = validate_tool_call_schema(call) # Extract tool name if call.get("type") == "function": tool_name = call["function"]["name"] args_str = call["function"]["arguments"] else: tool_name = call.get("name", "") args_str = call.get("arguments", "{}") args_valid = validate_tool_args(args_str, test.expected_args) tool_correct = tool_name == test.expected_tool passed = tool_correct and schema_valid return TestResult( id=test.id, name=test.name, passed=passed, tool_called=tool_name, expected_tool=test.expected_tool, schema_valid=schema_valid, args_valid=args_valid, latency_ms=latency, raw_response=json.dumps(tool_calls[:2])[:500], ) def run_dry_run() -> SuiteResult: """Validate schemas and test structure without a running server.""" print("=== DRY RUN — Schema Validation Only ===\n") results = [] for test in TEST_CASES: # Validate schemas parse schema_valid = True for tool in TOOL_SCHEMAS: try: assert "type" in tool assert tool["type"] == "function" func = tool["function"] assert "name" in func assert "description" in func assert "parameters" in func params = func["parameters"] assert "type" in params assert "properties" in params except AssertionError: schema_valid = False results.append(TestResult( id=test.id, name=test.name, passed=schema_valid, expected_tool=test.expected_tool, schema_valid=schema_valid, args_valid=True, )) passed = sum(1 for r in results if r.passed) suite = SuiteResult( timestamp=datetime.now(timezone.utc).isoformat(), model="dry-run", endpoint="none", kv_type="none", total=len(results), passed=passed, failed=len(results) - passed, accuracy=passed / len(results) if results else 0, meets_threshold=passed == len(results), threshold=1.0, results=[asdict(r) for r in results], ) return suite def run_suite(endpoint: str, model: str, kv_type: str, runs: int = 1, threshold: float = 0.95) -> SuiteResult: """Run the full tool call regression suite.""" print(f"=== TurboQuant Tool Call Regression Suite ===") print(f"Endpoint: {endpoint}") print(f"Model: {model}") print(f"KV Type: {kv_type}") print(f"Runs: {runs}") print(f"Threshold: {threshold:.0%}") print() # Check server is reachable try: import urllib.request health_req = urllib.request.Request(f"{endpoint}/models", method="GET") urllib.request.urlopen(health_req, timeout=5) except Exception as e: return SuiteResult( timestamp=datetime.now(timezone.utc).isoformat(), model=model, endpoint=endpoint, kv_type=kv_type, error=f"Server unreachable: {e}", ) all_results = [] for run_idx in range(runs): if runs > 1: print(f"\n--- Run {run_idx + 1}/{runs} ---") for test in TEST_CASES: print(f" {test.id}: ", end="", flush=True) result = run_single_test(endpoint, model, test) status = "PASS" if result.passed else "FAIL" tool_info = f"called={result.tool_called}" if result.tool_called else "no tool" print(f"{status} ({tool_info}, {result.latency_ms:.0f}ms)") if result.error: print(f" Error: {result.error}") all_results.append(result) passed = sum(1 for r in all_results if r.passed) total = len(all_results) accuracy = passed / total if total > 0 else 0 suite = SuiteResult( timestamp=datetime.now(timezone.utc).isoformat(), model=model, endpoint=endpoint, kv_type=kv_type, total=total, passed=passed, failed=total - passed, accuracy=accuracy, meets_threshold=accuracy >= threshold, threshold=threshold, results=[asdict(r) for r in all_results], ) print(f"\n{'='*60}") print(f"RESULTS: {passed}/{total} passed ({accuracy:.1%})") print(f"Threshold: {threshold:.0%}") print(f"VERDICT: {'PASS' if suite.meets_threshold else 'FAIL'}") print(f"{'='*60}") return suite # ── Markdown report ─────────────────────────────────────────────── def generate_report(suite: SuiteResult, output_path: str) -> None: """Generate a markdown results matrix.""" lines = [ "# Tool Call Regression Results", "", f"**Generated:** {suite.timestamp}", f"**Model:** {suite.model}", f"**Endpoint:** {suite.endpoint}", f"**KV Type:** {suite.kv_type}", "", "## Summary", "", f"| Metric | Value |", f"|--------|-------|", f"| Total tests | {suite.total} |", f"| Passed | {suite.passed} |", f"| Failed | {suite.failed} |", f"| Accuracy | {suite.accuracy:.1%} |", f"| Threshold | {suite.threshold:.0%} |", f"| Verdict | {'PASS' if suite.meets_threshold else 'FAIL'} |", "", "## Test Matrix", "", "| Test ID | Tool Expected | Tool Called | Schema | Args | Latency | Status |", "|---------|--------------|-------------|--------|------|---------|--------|", ] for r in suite.results: d = r if isinstance(r, dict) else asdict(r) status = "PASS" if d["passed"] else "FAIL" schema = "OK" if d.get("schema_valid") else "FAIL" args = "OK" if d.get("args_valid") else "FAIL" called = d.get("tool_called") or "none" latency = f"{d.get('latency_ms', 0):.0f}ms" lines.append( f"| {d['id']} | {d['expected_tool']} | {called} | {schema} | {args} | {latency} | {status} |" ) if suite.error: lines.extend(["", "## Error", "", suite.error]) os.makedirs(os.path.dirname(output_path), exist_ok=True) with open(output_path, "w") as f: f.write("\n".join(lines) + "\n") print(f"\nReport saved to {output_path}") # ── Main ────────────────────────────────────────────────────────── def main(): parser = argparse.ArgumentParser(description="TurboQuant Tool Call Regression Suite") parser.add_argument("--endpoint", default="http://localhost:8081/v1", help="llama.cpp OpenAI-compatible endpoint") parser.add_argument("--model", default="gemma-4", help="Model name") parser.add_argument("--kv-type", default="turbo4", help="KV cache type being tested") parser.add_argument("--runs", type=int, default=1, help="Number of runs per test") parser.add_argument("--threshold", type=float, default=0.95, help="Minimum accuracy to pass (0.0-1.0)") parser.add_argument("--output", default="benchmarks/tool-call-regression.md", help="Output markdown report path") parser.add_argument("--results-json", default="benchmarks/tool-call-regression.json", help="Output JSON results path") parser.add_argument("--dry-run", action="store_true", help="Validate schemas only, no server needed") args = parser.parse_args() if args.dry_run: suite = run_dry_run() else: suite = run_suite( endpoint=args.endpoint, model=args.model, kv_type=args.kv_type, runs=args.runs, threshold=args.threshold, ) # Save results generate_report(suite, args.output) os.makedirs(os.path.dirname(args.results_json), exist_ok=True) with open(args.results_json, "w") as f: json.dump(asdict(suite), f, indent=2) print(f"JSON results saved to {args.results_json}") # Exit code: 0 if passes threshold, 1 otherwise sys.exit(0 if suite.meets_threshold else 1) if __name__ == "__main__": main()