""" TurboQuant Compressed Model Tool Call Regression Suite — Issue #96 Run: pytest tests/tool_call_regression.py -v Generate matrix: pytest tests/tool_call_regression.py --generate-matrix """ import json import os import pathlib import re import time import unittest from typing import Dict import pytest ROOT = pathlib.Path(__file__).resolve().parents[1] BENCHMARKS_DIR = ROOT / "benchmarks" RESULTS_MATRIX = BENCHMARKS_DIR / "tool-call-regression.md" CORE_TOOLS = [ {"name": "read_file", "description": "Read a text file", "args": {"path": "/tmp/test.txt"}}, {"name": "web_search", "description": "Search the web", "args": {"query": "turboquant"}}, {"name": "terminal", "description": "Run a shell command", "args": {"command": "echo ok"}}, {"name": "execute_code", "description": "Run Python code", "args": {"code": "print(1)"}}, {"name": "delegate_task", "description": "Delegate to subagent", "args": {"goal": "test"}}, ] PARALLEL_TOOLS = [ {"name": "read_file", "args": {"path": "/tmp/a.txt"}}, {"name": "web_search", "args": {"query": "python"}}, {"name": "execute_code", "args": {"code": "x=1"}}, ] PASS_THRESHOLD = 0.95 class TestToolSchemaContract(unittest.TestCase): def test_core_tool_schemas_are_valid_functions(self): for tool in CORE_TOOLS: schema = { "type": "function", "function": { "name": tool["name"], "description": tool["description"], "parameters": { "type": "object", "properties": {}, "required": list(tool["args"].keys()), }, }, } parsed = json.loads(json.dumps(schema)) assert parsed["type"] == "function" fn = parsed["function"] assert fn["name"] == tool["name"] assert fn["description"] assert "parameters" in fn def test_parallel_tool_set_is_unique(self): names = [t["name"] for t in PARALLEL_TOOLS] assert len(names) == len(set(names)) def test_tool_call_response_format(self): tc = {"id": "call_abc", "type": "function", "function": {"name": "read_file", "arguments": json.dumps({"path": "/tmp/test.txt"})}} assert tc["type"] == "function" args = json.loads(tc["function"]["arguments"]) assert "path" in args def test_parallel_response_contains_multiple_calls(self): calls = [ {"id": "c1", "type": "function", "function": {"name": "read_file", "arguments": "{}"}}, {"id": "c2", "type": "function", "function": {"name": "web_search", "arguments": "{}"}}, {"id": "c3", "type": "function", "function": {"name": "execute_code","arguments": "{}"}}, ] assert len(calls) >= 3 call_names = {c["function"]["name"] for c in calls} assert len(call_names) >= 2 class TestProfileConfig(unittest.TestCase): @classmethod def setUpClass(cls): import yaml cls.profile = yaml.safe_load((ROOT / "profiles" / "hermes-profile-gemma4-turboquant.yaml").read_text()) def test_primary_provider_has_all_required_fields(self): """Provider must have model, endpoint, and turboquant config.""" p = self.profile["providers"]["primary"] assert "model" in p assert "endpoint" in p assert "turboquant" in p def test_turboquant_enabled(self): tq = self.profile["providers"]["primary"].get("turboquant", {}) assert tq.get("enabled") is True assert tq.get("kv_type") in ("turbo2", "turbo3", "turbo4") def test_server_command_has_turboquant_flags(self): cmd = self.profile["providers"]["primary"].get("server_command", "") assert "-ctk" in cmd and "-ctv" in cmd @pytest.mark.skipif( not os.environ.get("TURBOQUANT_SERVER_URL"), reason="Set TURBOQUANT_SERVER_URL to run live regression" ) class TestLiveRegression: RESULTS: Dict[str, bool] = {} def _call_model(self, tools, prompt, timeout=120): import requests url = os.environ["TURBOQUANT_SERVER_URL"] resp = requests.post( f"{url}/v1/chat/completions", json={"model": "gemma-4", "messages": [{"role": "user", "content": prompt}], "tools": tools, "tool_choice": "auto"}, timeout=timeout, ) resp.raise_for_status() return resp.json() def _has_valid_tool_call(self, data, expected_name): msg = data["choices"][0]["message"] for tc in msg.get("tool_calls", []): if tc["function"]["name"] == expected_name: json.loads(tc["function"]["arguments"]) return True return False def test_read_file(self): tools = [{"type":"function","function":{"name":"read_file","description":"Read file", "parameters":{"type":"object","properties":{"path":{"type":"string"}},"required":["path"]}}}] data = self._call_model(tools, "Read /tmp/test.txt") self.__class__.RESULTS["read_file"] = self._has_valid_tool_call(data, "read_file") def test_web_search(self): tools = [{"type":"function","function":{"name":"web_search","description":"Search", "parameters":{"type":"object","properties":{"query":{"type":"string"}},"required":["query"]}}}] data = self._call_model(tools, "Search for Python") self.__class__.RESULTS["web_search"] = self._has_valid_tool_call(data, "web_search") def test_terminal(self): tools = [{"type":"function","function":{"name":"terminal","description":"Shell", "parameters":{"type":"object","properties":{"command":{"type":"string"}},"required":["command"]}}}] data = self._call_model(tools, "List files") self.__class__.RESULTS["terminal"] = self._has_valid_tool_call(data, "terminal") def test_execute_code(self): tools = [{"type":"function","function":{"name":"execute_code","description":"Code", "parameters":{"type":"object","properties":{"code":{"type":"string"}},"required":["code"]}}}] data = self._call_model(tools, "Run: print('test')") self.__class__.RESULTS["execute_code"] = self._has_valid_tool_call(data, "execute_code") def test_delegate_task(self): tools = [{"type":"function","function":{"name":"delegate_task","description":"Delegate", "parameters":{"type":"object","properties":{"goal":{"type":"string"}},"required":["goal"]}}}] data = self._call_model(tools, "Delegate task: test") self.__class__.RESULTS["delegate_task"] = self._has_valid_tool_call(data, "delegate_task") def test_parallel_tool_calling(self): tools = [ {"type":"function","function":{"name":"read_file","description":"Read", "parameters":{"type":"object","properties":{"path":{"type":"string"}},"required":["path"]}},}, {"type":"function","function":{"name":"web_search","description":"Search", "parameters":{"type":"object","properties":{"query":{"type":"string"}},"required":["query"]}},}, {"type":"function","function":{"name":"execute_code","description":"Code", "parameters":{"type":"object","properties":{"code":{"type":"string"}},"required":["code"]}},}, ] data = self._call_model(tools, "Read a.txt, search python, run code") msg = data["choices"][0]["message"] calls = msg.get("tool_calls", []) names = {c["function"]["name"] for c in calls} self.__class__.RESULTS["parallel"] = len(names) >= 2 @classmethod def _accuracy(cls) -> float: if not cls.RESULTS: return 1.0 return sum(1 for v in cls.RESULTS.values() if v) / len(cls.RESULTS) @classmethod def teardown_class(cls): acc = cls._accuracy() print(f"\nTool Call Regression Accuracy: {acc*100:.1f}% (threshold {PASS_THRESHOLD*100:.0f}%)") for name, passed in cls.RESULTS.items(): print(f" {name}: {'PASS' if passed else 'FAIL'}") assert acc >= PASS_THRESHOLD, f"Accuracy {acc*100:.1f}% below {PASS_THRESHOLD*100:.0f}% gate" if os.environ.get("GENERATE_MATRIX"): _append_matrix(acc, cls.RESULTS) def _append_matrix(accuracy: float, results: Dict[str, bool]): timestamp = time.strftime("%Y-%m-%d %H:%M UTC", time.gmtime()) tool_names = [t["name"] for t in CORE_TOOLS] tool_checks = ["✓" if results.get(n, False) else "✗" for n in tool_names] parallel_check = "✓" if results.get("parallel") else "✗" row = f"| {timestamp} | gemma-4 | turbo4 | {accuracy*100:.1f}% | " + " | ".join(tool_checks) + f" | {parallel_check} |\n" header = ( "| Timestamp | Model | Preset | Accuracy | " + " | ".join(tool_names) + " | Parallel |\n" "|-----------|-------|--------|----------|" + "---|" * (len(tool_names) + 1) + "\n" ) if not RESULTS_MATRIX.exists(): RESULTS_MATRIX.write_text(header + row) else: content = RESULTS_MATRIX.read_text() if header not in content: content = header + row + content else: content = header + row + content.split(header, 1)[1] RESULTS_MATRIX.write_text(content) print(f"Matrix updated: {RESULTS_MATRIX}") def pytest_addoption(parser): parser.addoption("--generate-matrix", action="store_true", help="Update benchmarks/tool-call-regression.md with live results") def pytest_configure(config): if config.getoption("--generate-matrix"): os.environ["GENERATE_MATRIX"] = "1"