diff --git a/.gitea/workflows/smoke.yml b/.gitea/workflows/smoke.yml
index 01e3d860..15e922fb 100644
--- a/.gitea/workflows/smoke.yml
+++ b/.gitea/workflows/smoke.yml
@@ -29,6 +29,10 @@ jobs:
         run: |
           if grep -rE 'sk-or-|sk-ant-|ghp_|AKIA' . --include='*.yml' --include='*.py' --include='*.sh' 2>/dev/null | grep -v .gitea | grep -v llama-cpp-fork; then exit 1; fi
           echo "PASS: No secrets"
+      - name: Tool call regression suite (issue #96)
+        run: |
+          python3 -m pip install -q pytest pyyaml requests
+          pytest tests/tool_call_regression.py -v --tb=short
       - name: Markdown link check
         run: |
           python3 check_markdown_links.py
diff --git a/benchmarks/tool-call-regression.md b/benchmarks/tool-call-regression.md
new file mode 100644
index 00000000..84449c72
--- /dev/null
+++ b/benchmarks/tool-call-regression.md
@@ -0,0 +1,2 @@
+| Timestamp | Model | Preset | Accuracy | read_file | web_search | terminal | execute_code | delegate_task | Parallel |
+|-----------|-------|--------|----------|-----------|------------|----------|--------------|---------------|----------|
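For orientation, the sketch below shows how one result row lines up with those columns. Every value is hypothetical, not a real measurement; real rows are appended by `_append_matrix` in the suite that follows, and only when `GENERATE_MATRIX` is set.

```python
# Hypothetical example of composing one matrix row; none of these
# results are real measurements.
results = {"read_file": True, "web_search": True, "terminal": True,
           "execute_code": True, "delegate_task": True, "parallel": False}
tools = ["read_file", "web_search", "terminal", "execute_code", "delegate_task"]

accuracy = sum(results.values()) / len(results)  # 5 of 6 checks pass here
row = (f"| 2025-01-01 00:00 UTC | gemma-4 | turbo4 | {accuracy*100:.1f}% | "
       + " | ".join("✓" if results[t] else "✗" for t in tools)
       + f" | {'✓' if results['parallel'] else '✗'} |")
print(row)
# | 2025-01-01 00:00 UTC | gemma-4 | turbo4 | 83.3% | ✓ | ✓ | ✓ | ✓ | ✓ | ✗ |
```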
"arguments": "{}"}}, + {"id": "c2", "type": "function", "function": {"name": "web_search", "arguments": "{}"}}, + {"id": "c3", "type": "function", "function": {"name": "execute_code","arguments": "{}"}}, + ] + assert len(calls) >= 3 + call_names = {c["function"]["name"] for c in calls} + assert len(call_names) >= 2 + + +class TestProfileConfig(unittest.TestCase): + @classmethod + def setUpClass(cls): + import yaml + cls.profile = yaml.safe_load((ROOT / "profiles" / "hermes-profile-gemma4-turboquant.yaml").read_text()) + + def test_primary_provider_has_all_required_fields(self): + """Provider must have model, endpoint, and turboquant config.""" + p = self.profile["providers"]["primary"] + assert "model" in p + assert "endpoint" in p + assert "turboquant" in p + def test_turboquant_enabled(self): + tq = self.profile["providers"]["primary"].get("turboquant", {}) + assert tq.get("enabled") is True + assert tq.get("kv_type") in ("turbo2", "turbo3", "turbo4") + + def test_server_command_has_turboquant_flags(self): + cmd = self.profile["providers"]["primary"].get("server_command", "") + assert "-ctk" in cmd and "-ctv" in cmd + + +@pytest.mark.skipif( + not os.environ.get("TURBOQUANT_SERVER_URL"), + reason="Set TURBOQUANT_SERVER_URL to run live regression" +) +class TestLiveRegression: + RESULTS: Dict[str, bool] = {} + + def _call_model(self, tools, prompt, timeout=120): + import requests + url = os.environ["TURBOQUANT_SERVER_URL"] + resp = requests.post( + f"{url}/v1/chat/completions", + json={"model": "gemma-4", "messages": [{"role": "user", "content": prompt}], + "tools": tools, "tool_choice": "auto"}, + timeout=timeout, + ) + resp.raise_for_status() + return resp.json() + + def _has_valid_tool_call(self, data, expected_name): + msg = data["choices"][0]["message"] + for tc in msg.get("tool_calls", []): + if tc["function"]["name"] == expected_name: + json.loads(tc["function"]["arguments"]) + return True + return False + + def test_read_file(self): + tools = [{"type":"function","function":{"name":"read_file","description":"Read file", + "parameters":{"type":"object","properties":{"path":{"type":"string"}},"required":["path"]}}}] + data = self._call_model(tools, "Read /tmp/test.txt") + self.__class__.RESULTS["read_file"] = self._has_valid_tool_call(data, "read_file") + + def test_web_search(self): + tools = [{"type":"function","function":{"name":"web_search","description":"Search", + "parameters":{"type":"object","properties":{"query":{"type":"string"}},"required":["query"]}}}] + data = self._call_model(tools, "Search for Python") + self.__class__.RESULTS["web_search"] = self._has_valid_tool_call(data, "web_search") + + def test_terminal(self): + tools = [{"type":"function","function":{"name":"terminal","description":"Shell", + "parameters":{"type":"object","properties":{"command":{"type":"string"}},"required":["command"]}}}] + data = self._call_model(tools, "List files") + self.__class__.RESULTS["terminal"] = self._has_valid_tool_call(data, "terminal") + + def test_execute_code(self): + tools = [{"type":"function","function":{"name":"execute_code","description":"Code", + "parameters":{"type":"object","properties":{"code":{"type":"string"}},"required":["code"]}}}] + data = self._call_model(tools, "Run: print('test')") + self.__class__.RESULTS["execute_code"] = self._has_valid_tool_call(data, "execute_code") + + def test_delegate_task(self): + tools = [{"type":"function","function":{"name":"delegate_task","description":"Delegate", + 
"parameters":{"type":"object","properties":{"goal":{"type":"string"}},"required":["goal"]}}}] + data = self._call_model(tools, "Delegate task: test") + self.__class__.RESULTS["delegate_task"] = self._has_valid_tool_call(data, "delegate_task") + + def test_parallel_tool_calling(self): + tools = [ + {"type":"function","function":{"name":"read_file","description":"Read", + "parameters":{"type":"object","properties":{"path":{"type":"string"}},"required":["path"]}},}, + {"type":"function","function":{"name":"web_search","description":"Search", + "parameters":{"type":"object","properties":{"query":{"type":"string"}},"required":["query"]}},}, + {"type":"function","function":{"name":"execute_code","description":"Code", + "parameters":{"type":"object","properties":{"code":{"type":"string"}},"required":["code"]}},}, + ] + data = self._call_model(tools, "Read a.txt, search python, run code") + msg = data["choices"][0]["message"] + calls = msg.get("tool_calls", []) + names = {c["function"]["name"] for c in calls} + self.__class__.RESULTS["parallel"] = len(names) >= 2 + + @classmethod + def _accuracy(cls) -> float: + if not cls.RESULTS: + return 1.0 + return sum(1 for v in cls.RESULTS.values() if v) / len(cls.RESULTS) + + @classmethod + def teardown_class(cls): + acc = cls._accuracy() + print(f"\nTool Call Regression Accuracy: {acc*100:.1f}% (threshold {PASS_THRESHOLD*100:.0f}%)") + for name, passed in cls.RESULTS.items(): + print(f" {name}: {'PASS' if passed else 'FAIL'}") + assert acc >= PASS_THRESHOLD, f"Accuracy {acc*100:.1f}% below {PASS_THRESHOLD*100:.0f}% gate" + if os.environ.get("GENERATE_MATRIX"): + _append_matrix(acc, cls.RESULTS) + + +def _append_matrix(accuracy: float, results: Dict[str, bool]): + timestamp = time.strftime("%Y-%m-%d %H:%M UTC", time.gmtime()) + tool_names = [t["name"] for t in CORE_TOOLS] + tool_checks = ["✓" if results.get(n, False) else "✗" for n in tool_names] + parallel_check = "✓" if results.get("parallel") else "✗" + row = f"| {timestamp} | gemma-4 | turbo4 | {accuracy*100:.1f}% | " + " | ".join(tool_checks) + f" | {parallel_check} |\n" + header = ( + "| Timestamp | Model | Preset | Accuracy | " + + " | ".join(tool_names) + + " | Parallel |\n" + "|-----------|-------|--------|----------|" + + "---|" * (len(tool_names) + 1) + "\n" + ) + if not RESULTS_MATRIX.exists(): + RESULTS_MATRIX.write_text(header + row) + else: + content = RESULTS_MATRIX.read_text() + if header not in content: + content = header + row + content + else: + content = header + row + content.split(header, 1)[1] + RESULTS_MATRIX.write_text(content) + print(f"Matrix updated: {RESULTS_MATRIX}") + + +def pytest_addoption(parser): + parser.addoption("--generate-matrix", action="store_true", + help="Update benchmarks/tool-call-regression.md with live results") + + +def pytest_configure(config): + if config.getoption("--generate-matrix"): + os.environ["GENERATE_MATRIX"] = "1"