diff --git a/docs/model-benchmarks.md b/docs/model-benchmarks.md new file mode 100644 index 00000000..fa9f99e4 --- /dev/null +++ b/docs/model-benchmarks.md @@ -0,0 +1,1244 @@ +# Model Benchmark Results + +> Generated: 2026-03-24 01:28 UTC +> Ollama URL: `http://localhost:11434` +> Issue: [#1066](http://143.198.27.163:3000/rockachopa/Timmy-time-dashboard/issues/1066) + +## Overview + +This report documents the 5-test benchmark suite results for local model candidates. + +### Model Availability vs. Spec + +| Requested | Tested Substitute | Reason | +|-----------|-------------------|--------| +| `qwen3:14b` | `qwen2.5:14b` | `qwen3:14b` not pulled locally | +| `qwen3:8b` | `qwen3.5:latest` | `qwen3:8b` not pulled locally | +| `hermes3:8b` | `hermes3:8b` | Exact match | +| `dolphin3` | `llama3.2:latest` | `dolphin3` not pulled locally | + +## Summary Comparison Table + +| Model | Passed | Tool Calling | Code Gen | Shell Gen | Coherence | Triage Acc | Time (s) | +|-------|--------|-------------|----------|-----------|-----------|------------|----------| +| `hermes3:8b` | 3/5 | 100% | PASS | PASS | 20% | 60% | 72.8 | +| `qwen3.5:latest` | 1/5 | 30% | FAIL | FAIL | 100% | 0% | 309.7 | +| `qwen2.5:14b` | 4/5 | 100% | PASS | PASS | 100% | 60% | 105.7 | +| `llama3.2:latest` | 3/5 | 20% | PASS | PASS | 100% | 20% | 45.8 | + +## Per-Model Detail + +### `hermes3:8b` + +#### Benchmark 1: Tool Calling Compliance — ✅ PASS + +- **JSON Compliance:** 10/10 (100%) — target ≥90% +- **Time:** 9.84s + +#### Benchmark 2: Code Generation Correctness — ✅ PASS + +- **Result:** fibonacci(10) = 55 ✓ +- **Generated code snippet:** + ```python + def fibonacci(n): + if n == 0: + return 0 + elif n == 1: + return 1 + else: + return fibonacci(n - 1) + fibonacci(n - 2) + ``` +- **Time:** 1.14s + +#### Benchmark 3: Shell Command Generation — ✅ PASS + +- **Passed:** 5/5 — **Refusals:** 0 +- **Time:** 32.22s + +#### Benchmark 4: Multi-Turn Coherence — ❌ FAIL + +- **Coherent turns:** 1/5 (20%) — target 
≥80% +- **Time:** 24.59s + +#### Benchmark 5: Issue Triage Quality — ❌ FAIL + +- **Accuracy:** 3/5 (60%) — target ≥80% +- **Time:** 5.06s + +### `qwen3.5:latest` + +#### Benchmark 1: Tool Calling Compliance — ❌ FAIL + +- **JSON Compliance:** 3/10 (30%) — target ≥90% +- **Time:** 85.86s + +#### Benchmark 2: Code Generation Correctness — ❌ FAIL + +- **Result:** Runtime error: Traceback (most recent call last): + File "/var/folders/9k/v07xkpp133v03yynn9nx80fr0000gn/T/tmpddxkrf6i.py", line 3, in <module> + result = fibonacci(10) + ^^^^^^^^^ +NameError: name 'fibo… (error output truncated) +- **Time:** 18.68s + +#### Benchmark 3: Shell Command Generation — ❌ FAIL + +- **Passed:** 4/5 — **Refusals:** 0 +- **Time:** 85.54s + +#### Benchmark 4: Multi-Turn Coherence — ✅ PASS + +- **Coherent turns:** 5/5 (100%) — target ≥80% +- **Time:** 67.17s + +#### Benchmark 5: Issue Triage Quality — ❌ FAIL + +- **Accuracy:** 0/5 (0%) — target ≥80% +- **Time:** 52.42s + +### `qwen2.5:14b` + +#### Benchmark 1: Tool Calling Compliance — ✅ PASS + +- **JSON Compliance:** 10/10 (100%) — target ≥90% +- **Time:** 19.59s + +#### Benchmark 2: Code Generation Correctness — ✅ PASS + +- **Result:** fibonacci(10) = 55 ✓ +- **Generated code snippet (truncated; the full function is in the Raw JSON Data section):** + ```python + def fibonacci(n): + if n == 0: + return 0 + elif n == 1: + return 1 + else: + a, b = 0, 1 + for _ in range(2, n + 1): + ``` +- **Time:** 2.81s + +#### Benchmark 3: Shell Command Generation — ✅ PASS + +- **Passed:** 5/5 — **Refusals:** 0 +- **Time:** 48.7s + +#### Benchmark 4: Multi-Turn Coherence — ✅ PASS + +- **Coherent turns:** 5/5 (100%) — target ≥80% +- **Time:** 26.79s + +#### Benchmark 5: Issue Triage Quality — ❌ FAIL + +- **Accuracy:** 3/5 (60%) — target ≥80% +- **Time:** 7.76s + +### `llama3.2:latest` + +#### Benchmark 1: Tool Calling Compliance — ❌ FAIL + +- **JSON Compliance:** 2/10 (20%) — target ≥90% +- **Time:** 23.85s + +#### Benchmark 2: Code Generation Correctness — ✅ PASS + +- **Result:** fibonacci(10) = 55 ✓ +- **Generated code snippet:** + ```python + def 
fibonacci(n): + if n <= 1: + return n + a, b = 0, 1 + for _ in range(2, n + 1): + a, b = b, a + b + return b + ``` +- **Time:** 0.69s + +#### Benchmark 3: Shell Command Generation — ✅ PASS + +- **Passed:** 5/5 — **Refusals:** 0 +- **Time:** 10.91s + +#### Benchmark 4: Multi-Turn Coherence — ✅ PASS + +- **Coherent turns:** 5/5 (100%) — target ≥80% +- **Time:** 7.47s + +#### Benchmark 5: Issue Triage Quality — ❌ FAIL + +- **Accuracy:** 1/5 (20%) — target ≥80% +- **Time:** 2.9s + +## Raw JSON Data + +
+Click to expand full JSON results + +```json +{ + "hermes3:8b": { + "01_tool_calling": { + "benchmark": "tool_calling", + "model": "hermes3:8b", + "total_prompts": 10, + "valid_json_count": 10, + "compliance_rate": 1.0, + "passed": true, + "total_time_s": 9.84, + "results": [ + { + "prompt_id": 1, + "valid_json": true, + "has_expected_keys": true, + "elapsed_s": 3.23, + "response_snippet": "{\n \"tool\": \"get_weather\",\n \"args\": \"San Francisco\"\n}" + }, + { + "prompt_id": 2, + "valid_json": true, + "has_expected_keys": true, + "elapsed_s": 0.58, + "response_snippet": "{\n \"tool\": \"read_file\",\n \"args\": [\n \"/etc/hosts\"\n ]\n}" + }, + { + "prompt_id": 3, + "valid_json": true, + "has_expected_keys": true, + "elapsed_s": 0.51, + "response_snippet": "{\n \"tool\": \"search_web\",\n \"args\": \"latest Python release\"\n}" + }, + { + "prompt_id": 4, + "valid_json": true, + "has_expected_keys": true, + "elapsed_s": 0.81, + "response_snippet": "{\n \"tool\": \"Jira\",\n \"args\": {\n \"title\": \"Fix login bug\",\n \"priority\": \"high\"\n }\n}" + }, + { + "prompt_id": 5, + "valid_json": true, + "has_expected_keys": true, + "elapsed_s": 0.53, + "response_snippet": "{\n \"tool\": \"list_directory\",\n \"args\": [\"/home/user/projects\"]\n}" + }, + { + "prompt_id": 6, + "valid_json": true, + "has_expected_keys": true, + "elapsed_s": 0.76, + "response_snippet": "{\n \"tool\": \"send_notification\",\n \"args\": {\n \"message\": \"Deploy complete\",\n \"channel\": \"slack\"\n }\n}" + }, + { + "prompt_id": 7, + "valid_json": true, + "has_expected_keys": true, + "elapsed_s": 0.69, + "response_snippet": "{\n \"tool\": \"database_query\",\n \"args\": {\n \"sql\": \"SELECT COUNT(*) FROM users\"\n }\n}" + }, + { + "prompt_id": 8, + "valid_json": true, + "has_expected_keys": true, + "elapsed_s": 0.77, + "response_snippet": "{\n \"tool\": \"get_git_log\",\n \"args\": [\n \"limit=10\",\n \"branch='main'\"\n ]\n}" + }, + { + "prompt_id": 9, + "valid_json": true, + 
"has_expected_keys": true, + "elapsed_s": 0.94, + "response_snippet": "{\n \"tool\": \"schedule_task\",\n \"args\": {\n \"cron\": \"0 9 * * MON-FRI\",\n \"task\": \"generate_report\"\n }\n}" + }, + { + "prompt_id": 10, + "valid_json": true, + "has_expected_keys": true, + "elapsed_s": 1.02, + "response_snippet": "{\n \"tool\": \"resize_image\",\n \"args\": {\n \"url\": \"https://example.com/photo.jpg\",\n \"width\": 800,\n \"height\": 600\n" + } + ] + }, + "02_code_generation": { + "benchmark": "code_generation", + "model": "hermes3:8b", + "passed": true, + "detail": "fibonacci(10) = 55 \u2713", + "code_snippet": "def fibonacci(n):\n if n == 0:\n return 0\n elif n == 1:\n return 1\n else:\n return fibonacci(n - 1) + fibonacci(n - 2)", + "elapsed_s": 1.14 + }, + "03_shell_commands": { + "benchmark": "shell_commands", + "model": "hermes3:8b", + "total_prompts": 5, + "passed_count": 5, + "refused_count": 0, + "pass_rate": 1.0, + "passed": true, + "total_time_s": 32.22, + "results": [ + { + "prompt_id": 1, + "description": "find and delete old logs", + "refused": false, + "has_shell_command": true, + "passed": true, + "elapsed_s": 5.33, + "response_snippet": "Here's a bash one-liner to find all .log files older than 7 days and delete them:\n\nfind /path/to/search -type f -name \"*" + }, + { + "prompt_id": 2, + "description": "chmod recursive", + "refused": false, + "has_shell_command": true, + "passed": true, + "elapsed_s": 5.56, + "response_snippet": "To recursively change the permissions of all .sh files to be executable, you can use the following bash command:\n\n```bas" + }, + { + "prompt_id": 3, + "description": "kill processes by name", + "refused": false, + "has_shell_command": true, + "passed": true, + "elapsed_s": 4.89, + "response_snippet": "To kill all processes matching the name 'python3', you can use the following shell command:\n\n```bash\npkill -f python3\n``" + }, + { + "prompt_id": 4, + "description": "disk usage alert", + "refused": false, + 
"has_shell_command": true, + "passed": true, + "elapsed_s": 10.08, + "response_snippet": "Here's a bash script snippet that monitors disk usage and sends an alert if the usage exceeds 90%:\n\n```bash\n#!/bin/bash\n" + }, + { + "prompt_id": 5, + "description": "tar compress logs", + "refused": false, + "has_shell_command": true, + "passed": true, + "elapsed_s": 6.36, + "response_snippet": "To create a compressed tar archive of the /var/log directory, you can use the following bash command:\n\n```\ntar -czvf /pa" + } + ] + }, + "04_multi_turn_coherence": { + "benchmark": "multi_turn_coherence", + "model": "hermes3:8b", + "total_turns": 5, + "coherent_turns": 1, + "coherence_rate": 0.2, + "passed": false, + "total_time_s": 24.59, + "turns": [ + { + "turn": 1, + "valid_json": true, + "has_required_keys": true, + "coherent": true, + "confidence": 0.9, + "elapsed_s": 4.14, + "response_snippet": "{\n \"observation\": \"The CPU usage on the monitored web server has spiked to 95%, which is significantly higher than the normal operating range. This high CPU usage could indicate that the server is ex" + }, + { + "turn": 2, + "valid_json": false, + "has_required_keys": false, + "coherent": false, + "confidence": null, + "elapsed_s": 5.02, + "response_snippet": "{\n \"observation\": \"After investigating the server logs and performance metrics, it was found that the CPU spike was caused by three runaway Python processes, each consuming approximately 30% of the C" + }, + { + "turn": 3, + "valid_json": false, + "has_required_keys": false, + "coherent": false, + "confidence": null, + "elapsed_s": 5.31, + "response_snippet": "{\n \"observation\": \"After terminating the top two runaway Python processes, the CPU usage has decreased from 95% to 45%. 
However, a new alert has been triggered, indicating that the disk I/O usage has" + }, + { + "turn": 4, + "valid_json": false, + "has_required_keys": false, + "coherent": false, + "confidence": null, + "elapsed_s": 5.37, + "response_snippet": "{\n \"observation\": \"After investigating the high disk I/O usage, it was determined that the issue was caused by a log rotation script that had become stuck, resulting in excessive disk write operation" + }, + { + "turn": 5, + "valid_json": false, + "has_required_keys": false, + "coherent": false, + "confidence": null, + "elapsed_s": 4.75, + "response_snippet": "{\n \"observation\": \"The incident involving high CPU usage and disk I/O has been successfully resolved. The CPU spike was caused by three runaway Python processes, which were terminated, and the disk I" + } + ] + }, + "05_issue_triage": { + "benchmark": "issue_triage", + "model": "hermes3:8b", + "total_issues": 5, + "exact_matches": 3, + "accuracy": 0.6, + "passed": false, + "total_time_s": 5.06, + "results": [ + { + "issue_id": 1, + "title": "Production database is returning 500 errors on all queries", + "expected": "p0-critical", + "assigned": "p0-critical", + "exact_match": true, + "off_by_one": false, + "valid_json": true, + "elapsed_s": 1.18 + }, + { + "issue_id": 2, + "title": "Login page takes 8 seconds to load", + "expected": "p1-high", + "assigned": "p1-high", + "exact_match": true, + "off_by_one": false, + "valid_json": true, + "elapsed_s": 0.9 + }, + { + "issue_id": 3, + "title": "Add dark mode support to settings page", + "expected": "p3-low", + "assigned": "p2-medium", + "exact_match": false, + "off_by_one": true, + "valid_json": true, + "elapsed_s": 0.96 + }, + { + "issue_id": 4, + "title": "Email notifications sometimes arrive 10 minutes late", + "expected": "p2-medium", + "assigned": "p2-medium", + "exact_match": true, + "off_by_one": false, + "valid_json": true, + "elapsed_s": 1.09 + }, + { + "issue_id": 5, + "title": "Security vulnerability: SQL 
injection possible in search end", + "expected": "p0-critical", + "assigned": "p1-high", + "exact_match": false, + "off_by_one": true, + "valid_json": true, + "elapsed_s": 0.94 + } + ] + } + }, + "qwen3.5:latest": { + "01_tool_calling": { + "benchmark": "tool_calling", + "model": "qwen3.5:latest", + "total_prompts": 10, + "valid_json_count": 3, + "compliance_rate": 0.3, + "passed": false, + "total_time_s": 85.86, + "results": [ + { + "prompt_id": 1, + "valid_json": false, + "has_expected_keys": false, + "elapsed_s": 13.22, + "response_snippet": "" + }, + { + "prompt_id": 2, + "valid_json": true, + "has_expected_keys": true, + "elapsed_s": 8.07, + "response_snippet": "{\n \"tool\": \"read_file\",\n \"args\": {\n \"path\": \"/etc/hosts\"\n }\n}" + }, + { + "prompt_id": 3, + "valid_json": true, + "has_expected_keys": true, + "elapsed_s": 3.49, + "response_snippet": "```json\n{\n \"tool\": \"search_web\",\n \"args\": {\n \"query\": \"latest Python release\"\n }\n}\n```" + }, + { + "prompt_id": 4, + "valid_json": false, + "has_expected_keys": false, + "elapsed_s": 9.36, + "response_snippet": "" + }, + { + "prompt_id": 5, + "valid_json": false, + "has_expected_keys": false, + "elapsed_s": 9.18, + "response_snippet": "" + }, + { + "prompt_id": 6, + "valid_json": false, + "has_expected_keys": false, + "elapsed_s": 9.1, + "response_snippet": "" + }, + { + "prompt_id": 7, + "valid_json": false, + "has_expected_keys": false, + "elapsed_s": 9.14, + "response_snippet": "" + }, + { + "prompt_id": 8, + "valid_json": true, + "has_expected_keys": true, + "elapsed_s": 5.27, + "response_snippet": "{\n \"tool\": \"get_git_log\",\n \"args\": {\n \"limit\": 10,\n \"branch\": \"main\"\n }\n}" + }, + { + "prompt_id": 9, + "valid_json": false, + "has_expected_keys": false, + "elapsed_s": 9.82, + "response_snippet": "" + }, + { + "prompt_id": 10, + "valid_json": false, + "has_expected_keys": false, + "elapsed_s": 9.19, + "response_snippet": "" + } + ] + }, + "02_code_generation": { + 
"benchmark": "code_generation", + "model": "qwen3.5:latest", + "passed": false, + "detail": "Runtime error: Traceback (most recent call last):\n File \"/var/folders/9k/v07xkpp133v03yynn9nx80fr0000gn/T/tmpddxkrf6i.py\", line 3, in \n result = fibonacci(10)\n ^^^^^^^^^\nNameError: name 'fibo", + "code_snippet": "", + "elapsed_s": 18.68 + }, + "03_shell_commands": { + "benchmark": "shell_commands", + "model": "qwen3.5:latest", + "total_prompts": 5, + "passed_count": 4, + "refused_count": 0, + "pass_rate": 0.8, + "passed": false, + "total_time_s": 85.54, + "results": [ + { + "prompt_id": 1, + "description": "find and delete old logs", + "refused": false, + "has_shell_command": false, + "passed": false, + "elapsed_s": 19.16, + "response_snippet": "" + }, + { + "prompt_id": 2, + "description": "chmod recursive", + "refused": false, + "has_shell_command": true, + "passed": true, + "elapsed_s": 18.12, + "response_snippet": "To recursively change the permissions of all `.sh` files in the current directory and its subdirectories to executable, " + }, + { + "prompt_id": 3, + "description": "kill processes by name", + "refused": false, + "has_shell_command": true, + "passed": true, + "elapsed_s": 13.06, + "response_snippet": "To kill all processes matching the name `python3`, use one of the following commands:\n\n### Recommended (Simplest)\n```bas" + }, + { + "prompt_id": 4, + "description": "disk usage alert", + "refused": false, + "has_shell_command": true, + "passed": true, + "elapsed_s": 18.2, + "response_snippet": "Here's a robust and production-ready Bash script snippet that monitors disk usage and sends an alert if usage exceeds 90" + }, + { + "prompt_id": 5, + "description": "tar compress logs", + "refused": false, + "has_shell_command": true, + "passed": true, + "elapsed_s": 17.01, + "response_snippet": "To create a compressed tar archive of the `/var/log` directory using gzip (the most common compression method), use the " + } + ] + }, + "04_multi_turn_coherence": { 
+ "benchmark": "multi_turn_coherence", + "model": "qwen3.5:latest", + "total_turns": 5, + "coherent_turns": 5, + "coherence_rate": 1.0, + "passed": true, + "total_time_s": 67.17, + "turns": [ + { + "turn": 1, + "valid_json": true, + "has_required_keys": true, + "coherent": true, + "confidence": 0.85, + "elapsed_s": 13.14, + "response_snippet": "{\n \"observation\": \"Web server CPU usage has spiked to 95%, indicating potential performance issues that could affect service availability and user experience\",\n \"reasoning\": \"High CPU usage requires" + }, + { + "turn": 2, + "valid_json": true, + "has_required_keys": true, + "coherent": true, + "confidence": 0.92, + "elapsed_s": 12.84, + "response_snippet": "{\n \"observation\": \"Three Python processes identified as consuming excessive CPU resources (30% each, totaling ~90%), confirming the root cause of the CPU spike\",\n \"reasoning\": \"These runaway process" + }, + { + "turn": 3, + "valid_json": true, + "has_required_keys": true, + "coherent": true, + "confidence": 0.88, + "elapsed_s": 15.74, + "response_snippet": "{\n \"observation\": \"Disk I/O has spiked to 98% after CPU reduction, indicating a new bottleneck that could severely impact database performance and application responsiveness\",\n \"reasoning\": \"High di" + }, + { + "turn": 4, + "valid_json": true, + "has_required_keys": true, + "coherent": true, + "confidence": 0.95, + "elapsed_s": 13.42, + "response_snippet": "{\n \"observation\": \"All server metrics have returned to nominal levels after terminating the runaway Python processes and fixing the stuck log rotation script\",\n \"reasoning\": \"The incident has been r" + }, + { + "turn": 5, + "valid_json": true, + "has_required_keys": true, + "coherent": true, + "confidence": 0.98, + "elapsed_s": 12.03, + "response_snippet": "{\n \"observation\": \"Incident fully resolved - all metrics nominal, server stable\",\n \"reasoning\": \"Now documenting the incident for organizational learning and 
prevention. The post-mortem should captu" + } + ] + }, + "05_issue_triage": { + "benchmark": "issue_triage", + "model": "qwen3.5:latest", + "total_issues": 5, + "exact_matches": 0, + "accuracy": 0.0, + "passed": false, + "total_time_s": 52.42, + "results": [ + { + "issue_id": 1, + "title": "Production database is returning 500 errors on all queries", + "expected": "p0-critical", + "assigned": null, + "exact_match": false, + "off_by_one": false, + "valid_json": false, + "elapsed_s": 10.36 + }, + { + "issue_id": 2, + "title": "Login page takes 8 seconds to load", + "expected": "p1-high", + "assigned": null, + "exact_match": false, + "off_by_one": false, + "valid_json": false, + "elapsed_s": 10.25 + }, + { + "issue_id": 3, + "title": "Add dark mode support to settings page", + "expected": "p3-low", + "assigned": null, + "exact_match": false, + "off_by_one": false, + "valid_json": false, + "elapsed_s": 10.05 + }, + { + "issue_id": 4, + "title": "Email notifications sometimes arrive 10 minutes late", + "expected": "p2-medium", + "assigned": null, + "exact_match": false, + "off_by_one": false, + "valid_json": false, + "elapsed_s": 10.99 + }, + { + "issue_id": 5, + "title": "Security vulnerability: SQL injection possible in search end", + "expected": "p0-critical", + "assigned": null, + "exact_match": false, + "off_by_one": false, + "valid_json": false, + "elapsed_s": 10.77 + } + ] + } + }, + "qwen2.5:14b": { + "01_tool_calling": { + "benchmark": "tool_calling", + "model": "qwen2.5:14b", + "total_prompts": 10, + "valid_json_count": 10, + "compliance_rate": 1.0, + "passed": true, + "total_time_s": 19.59, + "results": [ + { + "prompt_id": 1, + "valid_json": true, + "has_expected_keys": true, + "elapsed_s": 6.87, + "response_snippet": "{\n \"tool\": \"get_weather\",\n \"args\": {\n \"location\": \"San Francisco\"\n }\n}" + }, + { + "prompt_id": 2, + "valid_json": true, + "has_expected_keys": true, + "elapsed_s": 1.18, + "response_snippet": "{\n \"tool\": \"read_file\",\n 
\"args\": {\n \"path\": \"/etc/hosts\"\n }\n}" + }, + { + "prompt_id": 3, + "valid_json": true, + "has_expected_keys": true, + "elapsed_s": 1.19, + "response_snippet": "{\n \"tool\": \"search_web\",\n \"args\": {\n \"query\": \"latest Python release\"\n }\n}" + }, + { + "prompt_id": 4, + "valid_json": true, + "has_expected_keys": true, + "elapsed_s": 1.61, + "response_snippet": "{\n \"tool\": \"create_issue\",\n \"args\": {\n \"title\": \"Fix login bug\",\n \"priority\": \"high\"\n }\n}" + }, + { + "prompt_id": 5, + "valid_json": true, + "has_expected_keys": true, + "elapsed_s": 1.13, + "response_snippet": "{\n \"tool\": \"list_directory\",\n \"args\": {\n \"path\": \"/home/user/projects\"\n }\n}" + }, + { + "prompt_id": 6, + "valid_json": true, + "has_expected_keys": true, + "elapsed_s": 1.36, + "response_snippet": "{\n \"tool\": \"send_notification\",\n \"args\": {\n \"message\": \"Deploy complete\",\n \"channel\": \"slack\"\n }\n}" + }, + { + "prompt_id": 7, + "valid_json": true, + "has_expected_keys": true, + "elapsed_s": 1.21, + "response_snippet": "{\n \"tool\": \"database_query\",\n \"args\": {\n \"sql\": \"SELECT COUNT(*) FROM users\"\n }\n}" + }, + { + "prompt_id": 8, + "valid_json": true, + "has_expected_keys": true, + "elapsed_s": 1.46, + "response_snippet": "{\n \"tool\": \"get_git_log\",\n \"args\": {\n \"limit\": 10,\n \"branch\": \"main\"\n }\n}" + }, + { + "prompt_id": 9, + "valid_json": true, + "has_expected_keys": true, + "elapsed_s": 1.65, + "response_snippet": "{\n \"tool\": \"schedule_task\",\n \"args\": {\n \"cron\": \"0 9 * * MON-FRI\",\n \"task\": \"generate_report\"\n }\n}" + }, + { + "prompt_id": 10, + "valid_json": true, + "has_expected_keys": true, + "elapsed_s": 1.93, + "response_snippet": "{\n \"tool\": \"resize_image\",\n \"args\": {\n \"url\": \"https://example.com/photo.jpg\",\n \"width\": 800,\n \"height\": 600\n" + } + ] + }, + "02_code_generation": { + "benchmark": "code_generation", + "model": "qwen2.5:14b", + "passed": true, + 
"detail": "fibonacci(10) = 55 \u2713", + "code_snippet": "def fibonacci(n):\n if n == 0:\n return 0\n elif n == 1:\n return 1\n else:\n a, b = 0, 1\n for _ in range(2, n + 1):\n a, b = b, a + b\n return b", + "elapsed_s": 2.81 + }, + "03_shell_commands": { + "benchmark": "shell_commands", + "model": "qwen2.5:14b", + "total_prompts": 5, + "passed_count": 5, + "refused_count": 0, + "pass_rate": 1.0, + "passed": true, + "total_time_s": 48.7, + "results": [ + { + "prompt_id": 1, + "description": "find and delete old logs", + "refused": false, + "has_shell_command": true, + "passed": true, + "elapsed_s": 7.5, + "response_snippet": "You can use the following bash one-liner to find and delete all `.log` files that are older than 7 days:\n\n```bash\nfind /" + }, + { + "prompt_id": 2, + "description": "chmod recursive", + "refused": false, + "has_shell_command": true, + "passed": true, + "elapsed_s": 6.4, + "response_snippet": "To recursively change the permissions of all `.sh` files to make them executable, you can use the following command:\n\n``" + }, + { + "prompt_id": 3, + "description": "kill processes by name", + "refused": false, + "has_shell_command": true, + "passed": true, + "elapsed_s": 7.15, + "response_snippet": "To kill all processes matching the name 'python3', you can use a combination of `pgrep` and `pkill` commands. Here is th" + }, + { + "prompt_id": 4, + "description": "disk usage alert", + "refused": false, + "has_shell_command": true, + "passed": true, + "elapsed_s": 20.46, + "response_snippet": "Certainly! Below is a simple Bash script snippet that monitors disk usage and sends an alert via email if the disk usage" + }, + { + "prompt_id": 5, + "description": "tar compress logs", + "refused": false, + "has_shell_command": true, + "passed": true, + "elapsed_s": 7.19, + "response_snippet": "To create a compressed tar archive of the `/var/log` directory, you can use the `tar` command with the `czf` options. 
Th" + } + ] + }, + "04_multi_turn_coherence": { + "benchmark": "multi_turn_coherence", + "model": "qwen2.5:14b", + "total_turns": 5, + "coherent_turns": 5, + "coherence_rate": 1.0, + "passed": true, + "total_time_s": 26.79, + "turns": [ + { + "turn": 1, + "valid_json": true, + "has_required_keys": true, + "coherent": true, + "confidence": 0.85, + "elapsed_s": 5.51, + "response_snippet": "{\n \"observation\": \"The CPU usage of the web server has suddenly increased to 95%, indicating a high load on the server.\",\n \"reasoning\": \"This spike could be due to a sudden increase in traffic, a ma" + }, + { + "turn": 2, + "valid_json": true, + "has_required_keys": true, + "coherent": true, + "confidence": 0.9, + "elapsed_s": 5.03, + "response_snippet": "{\n \"observation\": \"Three runaway Python processes are consuming 30% CPU each, contributing to the overall high CPU usage of 95%.\",\n \"reasoning\": \"These processes are likely the cause of the high CPU" + }, + { + "turn": 3, + "valid_json": true, + "has_required_keys": true, + "coherent": true, + "confidence": 0.88, + "elapsed_s": 6.5, + "response_snippet": "{\n \"observation\": \"After terminating two of the runaway Python processes, the CPU usage has dropped to 45%. However, a new alert indicates that disk I/O is now at 98%, suggesting a potential bottlene" + }, + { + "turn": 4, + "valid_json": true, + "has_required_keys": true, + "coherent": true, + "confidence": 0.95, + "elapsed_s": 5.37, + "response_snippet": "{\n \"observation\": \"The high disk I/O was traced to a stuck log rotation script, which was terminated. After termination, the disk I/O dropped to 20%, and all system metrics are now nominal.\",\n \"reas" + }, + { + "turn": 5, + "valid_json": true, + "has_required_keys": true, + "coherent": true, + "confidence": 1.0, + "elapsed_s": 4.39, + "response_snippet": "{\n \"observation\": \"The incident involving high CPU usage and disk I/O has been resolved. 
The root cause was identified as runaway Python processes and a stuck log rotation script.\",\n \"reasoning\": \"T" + } + ] + }, + "05_issue_triage": { + "benchmark": "issue_triage", + "model": "qwen2.5:14b", + "total_issues": 5, + "exact_matches": 3, + "accuracy": 0.6, + "passed": false, + "total_time_s": 7.76, + "results": [ + { + "issue_id": 1, + "title": "Production database is returning 500 errors on all queries", + "expected": "p0-critical", + "assigned": "p0-critical", + "exact_match": true, + "off_by_one": false, + "valid_json": true, + "elapsed_s": 1.64 + }, + { + "issue_id": 2, + "title": "Login page takes 8 seconds to load", + "expected": "p1-high", + "assigned": "p2-medium", + "exact_match": false, + "off_by_one": true, + "valid_json": true, + "elapsed_s": 1.71 + }, + { + "issue_id": 3, + "title": "Add dark mode support to settings page", + "expected": "p3-low", + "assigned": "p2-medium", + "exact_match": false, + "off_by_one": true, + "valid_json": true, + "elapsed_s": 1.45 + }, + { + "issue_id": 4, + "title": "Email notifications sometimes arrive 10 minutes late", + "expected": "p2-medium", + "assigned": "p2-medium", + "exact_match": true, + "off_by_one": false, + "valid_json": true, + "elapsed_s": 1.44 + }, + { + "issue_id": 5, + "title": "Security vulnerability: SQL injection possible in search end", + "expected": "p0-critical", + "assigned": "p0-critical", + "exact_match": true, + "off_by_one": false, + "valid_json": true, + "elapsed_s": 1.52 + } + ] + } + }, + "llama3.2:latest": { + "01_tool_calling": { + "benchmark": "tool_calling", + "model": "llama3.2:latest", + "total_prompts": 10, + "valid_json_count": 2, + "compliance_rate": 0.2, + "passed": false, + "total_time_s": 23.85, + "results": [ + { + "prompt_id": 1, + "valid_json": false, + "has_expected_keys": false, + "elapsed_s": 3.39, + "response_snippet": "import requests\n\ndef get_weather():\n url = \"http://api.openweathermap.org/data/2.5/weather?q=San Francisco&appid=YOUR" + }, + { + 
"prompt_id": 2, + "valid_json": false, + "has_expected_keys": false, + "elapsed_s": 2.61, + "response_snippet": "Here's an example of how you can use the `read_file` function to parse the contents of the `/etc/hosts` file and return " + }, + { + "prompt_id": 3, + "valid_json": false, + "has_expected_keys": false, + "elapsed_s": 0.8, + "response_snippet": "import requests\n\nresponse = requests.get('https://www.python.org/downloads/')\ndata = response.json()\n\nfor version in dat" + }, + { + "prompt_id": 4, + "valid_json": false, + "has_expected_keys": false, + "elapsed_s": 2.34, + "response_snippet": "Here's a simple Python function that calls the 'create_issue' function with the specified title and priority, and return" + }, + { + "prompt_id": 5, + "valid_json": false, + "has_expected_keys": false, + "elapsed_s": 2.62, + "response_snippet": "Here's an example of how you can implement the 'list_directory' tool using Python and the `os` module to list the conten" + }, + { + "prompt_id": 6, + "valid_json": false, + "has_expected_keys": false, + "elapsed_s": 2.35, + "response_snippet": "Here's a simple Python function that sends a notification to a Slack channel with a specific message:\n\n```python\nimport " + }, + { + "prompt_id": 7, + "valid_json": true, + "has_expected_keys": false, + "elapsed_s": 2.9, + "response_snippet": "Here's how you can invoke the 'database_query' function with the given SQL query and return only valid JSON with the spe" + }, + { + "prompt_id": 8, + "valid_json": false, + "has_expected_keys": false, + "elapsed_s": 3.0, + "response_snippet": "Here's how you can use the `get_git_log` tool with the specified parameters and return only valid JSON with the required" + }, + { + "prompt_id": 9, + "valid_json": true, + "has_expected_keys": true, + "elapsed_s": 2.85, + "response_snippet": "Here's a Python script that uses the `schedule` library to schedule a task and returns the result in JSON format:\n\n```py" + }, + { + "prompt_id": 10, + 
"valid_json": false, + "has_expected_keys": false, + "elapsed_s": 1.01, + "response_snippet": "import requests\n\ndef resize_image(url, width, height):\n response = requests.get(url)\n if response.status_code == 2" + } + ] + }, + "02_code_generation": { + "benchmark": "code_generation", + "model": "llama3.2:latest", + "passed": true, + "detail": "fibonacci(10) = 55 \u2713", + "code_snippet": "def fibonacci(n):\n if n <= 1:\n return n\n a, b = 0, 1\n for _ in range(2, n + 1):\n a, b = b, a + b\n return b", + "elapsed_s": 0.69 + }, + "03_shell_commands": { + "benchmark": "shell_commands", + "model": "llama3.2:latest", + "total_prompts": 5, + "passed_count": 5, + "refused_count": 0, + "pass_rate": 1.0, + "passed": true, + "total_time_s": 10.91, + "results": [ + { + "prompt_id": 1, + "description": "find and delete old logs", + "refused": false, + "has_shell_command": true, + "passed": true, + "elapsed_s": 1.67, + "response_snippet": "You can use the following bash one-liner to find all .log files older than 7 days and delete them:\n\n```bash\nfind . -type" + }, + { + "prompt_id": 2, + "description": "chmod recursive", + "refused": false, + "has_shell_command": true, + "passed": true, + "elapsed_s": 1.44, + "response_snippet": "You can use the following bash command to recursively change permissions of all .sh files to executable:\n\n```bash\nfind ." 
+ }, + { + "prompt_id": 3, + "description": "kill processes by name", + "refused": false, + "has_shell_command": true, + "passed": true, + "elapsed_s": 1.44, + "response_snippet": "You can use the following shell command to kill all processes matching the name 'python3':\n\n```bash\npkill -9 python3\n```" + }, + { + "prompt_id": 4, + "description": "disk usage alert", + "refused": false, + "has_shell_command": true, + "passed": true, + "elapsed_s": 4.49, + "response_snippet": "Here's a bash script snippet that monitors disk usage and sends an alert if usage exceeds 90%:\n\n```bash\n#!/bin/bash\n\n# S" + }, + { + "prompt_id": 5, + "description": "tar compress logs", + "refused": false, + "has_shell_command": true, + "passed": true, + "elapsed_s": 1.87, + "response_snippet": "You can use the following bash command to create a compressed tar archive of the /var/log directory:\n\n```bash\ntar -czvf " + } + ] + }, + "04_multi_turn_coherence": { + "benchmark": "multi_turn_coherence", + "model": "llama3.2:latest", + "total_turns": 5, + "coherent_turns": 5, + "coherence_rate": 1.0, + "passed": true, + "total_time_s": 7.47, + "turns": [ + { + "turn": 1, + "valid_json": true, + "has_required_keys": true, + "coherent": true, + "confidence": 0.8, + "elapsed_s": 1.41, + "response_snippet": "{\n \"observation\": \"CPU usage has spiked to 95%, indicating a potential resource bottleneck on the web server.\",\n \"reasoning\": \"Given the sudden increase in CPU usage, it is likely that the server is" + }, + { + "turn": 2, + "valid_json": true, + "has_required_keys": true, + "coherent": true, + "confidence": 0.9, + "elapsed_s": 1.48, + "response_snippet": "{\n \"observation\": \"Three runaway Python processes are consuming 30% CPU each, indicating a potential memory leak or inefficient resource usage.\",\n \"reasoning\": \"Given the high CPU usage of these pro" + }, + { + "turn": 3, + "valid_json": true, + "has_required_keys": true, + "coherent": true, + "confidence": 0.85, + 
"elapsed_s": 1.79, + "response_snippet": "{\n \"observation\": \"Disk I/O is at 98%, indicating a potential storage bottleneck on the web server.\",\n \"reasoning\": \"Given the high disk I/O usage, it is likely that the server is experiencing a hig" + }, + { + "turn": 4, + "valid_json": true, + "has_required_keys": true, + "coherent": true, + "confidence": 0.95, + "elapsed_s": 1.52, + "response_snippet": "{\n \"observation\": \"All system metrics are now within nominal ranges, indicating that the server has recovered from the resource bottleneck.\",\n \"reasoning\": \"Given that the log rotation script was th" + }, + { + "turn": 5, + "valid_json": true, + "has_required_keys": true, + "coherent": true, + "confidence": 1.0, + "elapsed_s": 1.27, + "response_snippet": "{\n \"observation\": \"The incident has been resolved, and all system metrics are now within nominal ranges.\",\n \"reasoning\": \"The root cause of the incident was identified as a stuck log rotation script" + } + ] + }, + "05_issue_triage": { + "benchmark": "issue_triage", + "model": "llama3.2:latest", + "total_issues": 5, + "exact_matches": 1, + "accuracy": 0.2, + "passed": false, + "total_time_s": 2.9, + "results": [ + { + "issue_id": 1, + "title": "Production database is returning 500 errors on all queries", + "expected": "p0-critical", + "assigned": "p0-critical", + "exact_match": true, + "off_by_one": false, + "valid_json": true, + "elapsed_s": 0.54 + }, + { + "issue_id": 2, + "title": "Login page takes 8 seconds to load", + "expected": "p1-high", + "assigned": "p3-low", + "exact_match": false, + "off_by_one": false, + "valid_json": true, + "elapsed_s": 0.62 + }, + { + "issue_id": 3, + "title": "Add dark mode support to settings page", + "expected": "p3-low", + "assigned": "p2-medium", + "exact_match": false, + "off_by_one": true, + "valid_json": true, + "elapsed_s": 0.66 + }, + { + "issue_id": 4, + "title": "Email notifications sometimes arrive 10 minutes late", + "expected": "p2-medium", + 
"assigned": "p3-low", + "exact_match": false, + "off_by_one": true, + "valid_json": true, + "elapsed_s": 0.58 + }, + { + "issue_id": 5, + "title": "Security vulnerability: SQL injection possible in search end", + "expected": "p0-critical", + "assigned": "p1-high", + "exact_match": false, + "off_by_one": true, + "valid_json": true, + "elapsed_s": 0.5 + } + ] + } + } +} +``` + +
OLLAMA_URL = "http://localhost:11434"

# Every prompt ends with the same output-format instruction and expects the
# same two top-level keys, so only the tool-specific instruction is listed.
_JSON_ONLY_SUFFIX = "Return ONLY valid JSON with keys: tool, args."
_TOOL_INSTRUCTIONS = (
    "Call the 'get_weather' tool to retrieve the current weather for San Francisco. ",
    "Invoke the 'read_file' function with path='/etc/hosts'. ",
    "Use the 'search_web' tool to look up 'latest Python release'. ",
    "Call 'create_issue' with title='Fix login bug' and priority='high'. ",
    "Execute the 'list_directory' tool for path='/home/user/projects'. ",
    "Call 'send_notification' with message='Deploy complete' and channel='slack'. ",
    "Invoke 'database_query' with sql='SELECT COUNT(*) FROM users'. ",
    "Use the 'get_git_log' tool with limit=10 and branch='main'. ",
    "Call 'schedule_task' with cron='0 9 * * MON-FRI' and task='generate_report'. ",
    "Invoke 'resize_image' with url='https://example.com/photo.jpg', width=800, height=600. ",
)
TOOL_PROMPTS = [
    {"prompt": instruction + _JSON_ONLY_SUFFIX, "expected_keys": ["tool", "args"]}
    for instruction in _TOOL_INSTRUCTIONS
]


def _first_json_value(text: str, opener: str) -> Any:
    """Decode the first JSON value in *text* that starts with *opener*.

    Every occurrence of *opener* is tried in order, so a brace in prose that
    does not begin valid JSON is skipped. Returns None when nothing decodes.
    """
    decoder = json.JSONDecoder()
    start = text.find(opener)
    while start != -1:
        try:
            value, _end = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            start = text.find(opener, start + 1)
        else:
            return value
    return None


def extract_json(text: str) -> Any:
    """Extract the first JSON object or array embedded in *text*.

    The whole (stripped) string is parsed first. If that fails, the text is
    scanned for the first position where a complete JSON object decodes, and
    then for the first array. Using the real JSON decoder instead of a regex
    means nesting depth and braces inside string values are handled, and
    markdown fences or surrounding prose are simply skipped over.

    Returns:
        The decoded value, or None when no JSON could be found.
    """
    text = text.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Objects are tried before arrays so a stray "[1]" in prose cannot shadow
    # the tool-call object that follows it.
    for opener in ("{", "["):
        value = _first_json_value(text, opener)
        if value is not None:
            return value
    return None


def run_prompt(model: str, prompt: str) -> str:
    """Send *prompt* to Ollama's ``/api/generate`` endpoint and return the reply text.

    The request is non-streaming, uses temperature 0.1 and caps generation at
    256 tokens. HTTP error statuses are raised by ``raise_for_status()``.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 256},
    }
    resp = requests.post(f"{OLLAMA_URL}/api/generate", json=payload, timeout=120)
    resp.raise_for_status()
    return resp.json()["response"]


def run_benchmark(model: str) -> dict:
    """Run the tool-calling benchmark for a single model.

    Each prompt in ``TOOL_PROMPTS`` is sent once. A prompt counts towards the
    compliance rate when *any* JSON value can be extracted from the reply;
    ``has_expected_keys`` is recorded per prompt but does not affect the score.
    The benchmark passes at a compliance rate of at least 0.90.

    Returns:
        A summary dict with the per-prompt results under ``"results"``.
    """
    results = []
    total_time = 0.0

    for prompt_id, case in enumerate(TOOL_PROMPTS, 1):
        start = time.time()
        try:
            raw = run_prompt(model, case["prompt"])
            elapsed = time.time() - start
            parsed = extract_json(raw)
            valid_json = parsed is not None
            has_keys = (
                valid_json
                and isinstance(parsed, dict)
                and all(key in parsed for key in case["expected_keys"])
            )
            entry = {
                "prompt_id": prompt_id,
                "valid_json": valid_json,
                "has_expected_keys": has_keys,
                "elapsed_s": round(elapsed, 2),
                "response_snippet": raw[:120],
            }
        except Exception as exc:
            # A failed request is recorded as a non-compliant prompt so one
            # timeout does not abort the whole run.
            elapsed = time.time() - start
            entry = {
                "prompt_id": prompt_id,
                "valid_json": False,
                "has_expected_keys": False,
                "elapsed_s": round(elapsed, 2),
                "error": str(exc),
            }
        results.append(entry)
        total_time += elapsed

    valid_count = sum(1 for r in results if r["valid_json"])
    compliance_rate = valid_count / len(TOOL_PROMPTS)

    return {
        "benchmark": "tool_calling",
        "model": model,
        "total_prompts": len(TOOL_PROMPTS),
        "valid_json_count": valid_count,
        "compliance_rate": round(compliance_rate, 3),
        "passed": compliance_rate >= 0.90,
        "total_time_s": round(total_time, 2),
        "results": results,
    }
against {model}...") + result = run_benchmark(model) + print(json.dumps(result, indent=2)) + sys.exit(0 if result["passed"] else 1) diff --git a/scripts/benchmarks/02_code_generation.py b/scripts/benchmarks/02_code_generation.py new file mode 100644 index 00000000..af8a8159 --- /dev/null +++ b/scripts/benchmarks/02_code_generation.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 +"""Benchmark 2: Code Generation Correctness + +Ask model to generate a fibonacci function, execute it, verify fib(10) = 55. +""" + +from __future__ import annotations + +import json +import re +import subprocess +import sys +import tempfile +import time +from pathlib import Path + +import requests + +OLLAMA_URL = "http://localhost:11434" + +CODEGEN_PROMPT = """\ +Write a Python function called `fibonacci(n)` that returns the nth Fibonacci number \ +(0-indexed, so fibonacci(0)=0, fibonacci(1)=1, fibonacci(10)=55). + +Return ONLY the raw Python code — no markdown fences, no explanation, no extra text. +The function must be named exactly `fibonacci`. 
# Opening fence with an optional Python language tag, then the code up to the
# closing fence -- or to end of text when generation was cut off mid-block.
_FENCE_RE = re.compile(
    r"```(?:(?:python3?|py3?)(?!\w))?\s*(.*?)(?:```|\Z)",
    re.DOTALL | re.IGNORECASE,
)


def extract_python(text: str) -> str:
    """Extract Python source from a model response.

    If the response contains a markdown fence, the content of the first fenced
    block is returned with the fence and any ``python``/``py``/``python3`` tag
    removed; an unterminated fence runs to the end of the text. Otherwise the
    stripped response is returned unchanged.
    """
    text = text.strip()

    fence_match = _FENCE_RE.search(text)
    if fence_match:
        code = fence_match.group(1).strip()
        if code:
            return code

    # No usable fenced block: assume the model obeyed the "raw code" instruction.
    return text


def run_prompt(model: str, prompt: str) -> str:
    """Send *prompt* to Ollama's ``/api/generate`` endpoint and return the reply text.

    The request is non-streaming, uses temperature 0.1 and caps generation at
    512 tokens. HTTP error statuses are raised by ``raise_for_status()``.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 512},
    }
    resp = requests.post(f"{OLLAMA_URL}/api/generate", json=payload, timeout=120)
    resp.raise_for_status()
    return resp.json()["response"]


def execute_fibonacci(code: str) -> tuple[bool, str]:
    """Execute generated code in a subprocess and check ``fibonacci(10) == 55``.

    Returns:
        ``(passed, detail)`` where *detail* is a short human-readable verdict.
    """
    test_code = code + "\n\nresult = fibonacci(10)\nprint(result)\n"

    # NOTE(security): this runs model-generated code with the current user's
    # privileges; the 10 s timeout is the only containment.
    with tempfile.TemporaryDirectory() as workdir:
        script = Path(workdir) / "candidate.py"
        try:
            # Python source is UTF-8 by default, so write it as UTF-8 instead
            # of the locale encoding.
            script.write_text(test_code, encoding="utf-8")
            proc = subprocess.run(
                [sys.executable, str(script)],
                capture_output=True,
                text=True,
                timeout=10,
            )
        except subprocess.TimeoutExpired:
            return False, "Execution timed out"
        except Exception as exc:
            return False, f"Execution error: {exc}"

    output = proc.stdout.strip()
    if proc.returncode != 0:
        return False, f"Runtime error: {proc.stderr.strip()[:200]}"
    if output == "55":
        return True, "fibonacci(10) = 55 ✓"
    return False, f"Expected 55, got: {output!r}"


def run_benchmark(model: str) -> dict:
    """Run the code generation benchmark for a single model.

    Prompts the model with ``CODEGEN_PROMPT``, extracts the code and executes
    it. Any exception from the request or extraction is reported as a failed
    result with an ``"error"`` field instead of being raised.
    """
    start = time.time()
    try:
        raw = run_prompt(model, CODEGEN_PROMPT)
        code = extract_python(raw)
        correct, detail = execute_fibonacci(code)
    except Exception as exc:
        return {
            "benchmark": "code_generation",
            "model": model,
            "passed": False,
            "error": str(exc),
            "elapsed_s": round(time.time() - start, 2),
        }

    return {
        "benchmark": "code_generation",
        "model": model,
        "passed": correct,
        "detail": detail,
        "code_snippet": code[:300],
        "elapsed_s": round(time.time() - start, 2),
    }
# Positive shell command indicators. One match anywhere in the reply is enough
# for has_shell_command() to accept it.
COMMAND_PATTERNS = [
    r"\bfind\s+[./~\w]",  # includes "find ." and "find ~/dir", not only "find /dir"
    r"\bchmod\b",
    r"\bp?kill(?:all)?\b",  # kill, pkill and killall
    r"\btar\b",
    r"\bdf\s+-",
    r"\bdu\s+-",
    r"\$\(",
    r"\bawk\b",
    r"\bsed\b",
    r"\bgrep\b",
    r"\bxargs\b",
    r"```(?:bash|sh|shell)",
    r"\bif\s+\[",
]


def is_refusal(text: str) -> bool:
    """Return True when *text* matches any pattern in ``REFUSAL_PATTERNS``.

    Matching is case-insensitive.
    """
    lowered = text.lower()
    return any(re.search(pattern, lowered, re.IGNORECASE) for pattern in REFUSAL_PATTERNS)


def has_shell_command(text: str) -> bool:
    """Return True when *text* matches any pattern in ``COMMAND_PATTERNS``.

    Matching is case-sensitive: shell command names are lowercase.
    """
    return any(re.search(pattern, text) for pattern in COMMAND_PATTERNS)


def run_prompt(model: str, prompt: str) -> str:
    """Send *prompt* to Ollama's ``/api/generate`` endpoint and return the reply text.

    The request is non-streaming, uses temperature 0.1 and caps generation at
    512 tokens. HTTP error statuses are raised by ``raise_for_status()``.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 512},
    }
    resp = requests.post(f"{OLLAMA_URL}/api/generate", json=payload, timeout=120)
    resp.raise_for_status()
    return resp.json()["response"]


def run_benchmark(model: str) -> dict:
    """Run the shell command generation benchmark for a single model.

    A prompt passes when the reply is not a refusal and contains something
    that looks like a shell command. The benchmark passes only when every
    prompt passes and none was refused.

    Returns:
        A summary dict with the per-prompt results under ``"results"``.
    """
    results = []
    total_time = 0.0

    for prompt_id, case in enumerate(SHELL_PROMPTS, 1):
        start = time.time()
        try:
            raw = run_prompt(model, case["prompt"])
            elapsed = time.time() - start
            refused = is_refusal(raw)
            has_cmd = has_shell_command(raw)
            entry = {
                "prompt_id": prompt_id,
                "description": case["description"],
                "refused": refused,
                "has_shell_command": has_cmd,
                "passed": not refused and has_cmd,
                "elapsed_s": round(elapsed, 2),
                "response_snippet": raw[:120],
            }
        except Exception as exc:
            # A failed request counts as a failed prompt, not as a refusal.
            elapsed = time.time() - start
            entry = {
                "prompt_id": prompt_id,
                "description": case["description"],
                "refused": False,
                "has_shell_command": False,
                "passed": False,
                "elapsed_s": round(elapsed, 2),
                "error": str(exc),
            }
        results.append(entry)
        total_time += elapsed

    refused_count = sum(1 for r in results if r["refused"])
    passed_count = sum(1 for r in results if r["passed"])
    pass_rate = passed_count / len(SHELL_PROMPTS)

    return {
        "benchmark": "shell_commands",
        "model": model,
        "total_prompts": len(SHELL_PROMPTS),
        "passed_count": passed_count,
        "refused_count": refused_count,
        "pass_rate": round(pass_rate, 3),
        "passed": refused_count == 0 and passed_count == len(SHELL_PROMPTS),
        "total_time_s": round(total_time, 2),
        "results": results,
    }
REQUIRED_KEYS = {"observation", "reasoning", "action", "confidence"}


def extract_json(text: str) -> dict | None:
    """Extract the agent's JSON reply from *text*.

    The whole (stripped) string is parsed first. If that fails, the text is
    scanned for the first position at which a complete JSON object decodes,
    which copes with markdown fences, surrounding prose and arbitrarily nested
    objects.

    Returns:
        The decoded value, or None when nothing decodes. A reply that is
        itself valid non-object JSON is returned as-is from the direct parse,
        so callers still need their ``isinstance(..., dict)`` check.
    """
    text = text.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            value, _end = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
        else:
            return value
    return None


def _is_valid_confidence(value: object) -> bool:
    """Return True for a real number in [0.0, 1.0]; JSON booleans are rejected."""
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return False
    return 0.0 <= value <= 1.0


def run_multi_turn(model: str) -> dict:
    """Run the multi-turn coherence benchmark.

    The turns in ``TURNS`` are sent in order through Ollama's ``/api/chat``
    endpoint, with the growing message history (system prompt, user turns and
    the model's raw replies) resent each time. A turn is coherent when the
    reply is a JSON object containing every key in ``REQUIRED_KEYS`` and a
    numeric ``confidence`` between 0.0 and 1.0. The benchmark passes at a
    coherence rate of at least 0.80.

    Returns:
        A summary dict with the per-turn results under ``"turns"``.
    """
    turn_results = []
    total_time = 0.0
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]

    for turn_no, turn_prompt in enumerate(TURNS, 1):
        messages.append({"role": "user", "content": turn_prompt})
        start = time.time()

        try:
            payload = {
                "model": model,
                "messages": messages,
                "stream": False,
                "options": {"temperature": 0.1, "num_predict": 512},
            }
            resp = requests.post(f"{OLLAMA_URL}/api/chat", json=payload, timeout=120)
            resp.raise_for_status()
            raw = resp.json()["message"]["content"]
        except Exception as exc:
            elapsed = time.time() - start
            total_time += elapsed
            turn_results.append(
                {
                    "turn": turn_no,
                    "valid_json": False,
                    "has_required_keys": False,
                    "coherent": False,
                    "elapsed_s": round(elapsed, 2),
                    "error": str(exc),
                }
            )
            # Placeholder reply keeps user/assistant roles alternating so the
            # remaining turns can still be sent.
            messages.append({"role": "assistant", "content": "{}"})
            continue

        elapsed = time.time() - start
        total_time += elapsed

        parsed = extract_json(raw)
        valid = parsed is not None
        has_keys = valid and isinstance(parsed, dict) and REQUIRED_KEYS.issubset(parsed.keys())
        coherent = has_keys and _is_valid_confidence(parsed["confidence"])

        turn_results.append(
            {
                "turn": turn_no,
                "valid_json": valid,
                "has_required_keys": has_keys,
                "coherent": coherent,
                "confidence": parsed.get("confidence") if has_keys else None,
                "elapsed_s": round(elapsed, 2),
                "response_snippet": raw[:200],
            }
        )

        # Feed the model's own reply back as history for the next turn.
        messages.append({"role": "assistant", "content": raw})

    coherent_count = sum(1 for r in turn_results if r["coherent"])
    coherence_rate = coherent_count / len(TURNS)

    return {
        "benchmark": "multi_turn_coherence",
        "model": model,
        "total_turns": len(TURNS),
        "coherent_turns": coherent_count,
        "coherence_rate": round(coherence_rate, 3),
        "passed": coherence_rate >= 0.80,
        "total_time_s": round(total_time, 2),
        "turns": turn_results,
    }
+ +Issue: {title} +Description: {description} + +Respond ONLY with valid JSON: +{{"priority": "", "reason": ""}} +""" + +ISSUES = [ + { + "title": "Production database is returning 500 errors on all queries", + "description": "All users are affected, no transactions are completing, revenue is being lost.", + "expected_priority": "p0-critical", + }, + { + "title": "Login page takes 8 seconds to load", + "description": "Performance regression noticed after last deployment. Users are complaining but can still log in.", + "expected_priority": "p1-high", + }, + { + "title": "Add dark mode support to settings page", + "description": "Several users have requested a dark mode toggle in the account settings.", + "expected_priority": "p3-low", + }, + { + "title": "Email notifications sometimes arrive 10 minutes late", + "description": "Intermittent delay in notification delivery, happens roughly 5% of the time.", + "expected_priority": "p2-medium", + }, + { + "title": "Security vulnerability: SQL injection possible in search endpoint", + "description": "Penetration test found unescaped user input being passed directly to database query.", + "expected_priority": "p0-critical", + }, +] + +VALID_PRIORITIES = {"p0-critical", "p1-high", "p2-medium", "p3-low"} + +# Map p0 -> 0, p1 -> 1, etc. 
# Canonical label -> severity level (0 is most severe). The level distance is
# what run_benchmark() uses to flag an answer as off by one.
PRIORITY_LEVELS = {"p0-critical": 0, "p1-high": 1, "p2-medium": 2, "p3-low": 3}

_LABEL_BY_LEVEL = {level: label for label, level in PRIORITY_LEVELS.items()}
_KEYWORD_LEVELS = {"critical": 0, "high": 1, "medium": 2, "low": 3}
# A level digit 0-3, optionally prefixed with "p", that is not part of a longer
# word or number ("p0", "1", "p3_low" match; "p10" and "xp2" do not).
_LEVEL_RE = re.compile(r"(?<![a-z0-9])p?([0-3])(?![0-9])")


def extract_json(text: str) -> dict | None:
    """Extract the triage JSON reply from *text*.

    The whole (stripped) string is parsed first. If that fails, the text is
    scanned for the first position at which a complete JSON object decodes.
    Using the JSON decoder rather than a brace regex means a ``{`` or ``}``
    inside the ``reason`` string no longer breaks extraction.

    Returns:
        The decoded value, or None when nothing decodes. A reply that is
        itself valid non-object JSON is returned as-is from the direct parse.
    """
    text = text.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            value, _end = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
        else:
            return value
    return None


def normalize_priority(raw: str) -> str | None:
    """Normalize a model-supplied priority to its canonical ``pN-name`` label.

    Accepts the canonical labels, bare levels (``"p1"``, ``"1"``), bare names
    (``"high"``) and decorated variants of either (``"P0 - Critical"``,
    ``"p3_low"``, ``"Priority: high"``). A level digit wins over a name; a
    name is only used when exactly one of the four names appears.

    Returns:
        The canonical label, or None when the value cannot be interpreted.
    """
    raw = raw.lower().strip()
    if raw in PRIORITY_LEVELS:
        return raw

    numbered = _LEVEL_RE.search(raw)
    if numbered:
        return _LABEL_BY_LEVEL[int(numbered.group(1))]

    named = {
        level
        for word, level in _KEYWORD_LEVELS.items()
        if re.search(rf"\b{word}\b", raw)
    }
    if len(named) == 1:
        return _LABEL_BY_LEVEL[named.pop()]
    return None


def run_prompt(model: str, prompt: str) -> str:
    """Send *prompt* to Ollama's ``/api/generate`` endpoint and return the reply text.

    The request is non-streaming, uses temperature 0.1 and caps generation at
    256 tokens. HTTP error statuses are raised by ``raise_for_status()``.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 256},
    }
    resp = requests.post(f"{OLLAMA_URL}/api/generate", json=payload, timeout=120)
    resp.raise_for_status()
    return resp.json()["response"]


def run_benchmark(model: str) -> dict:
    """Run the issue triage benchmark for a single model.

    Each issue in ``ISSUES`` is triaged once. Accuracy counts exact matches
    only; ``off_by_one`` is recorded per issue for information but earns no
    credit. The benchmark passes at an accuracy of at least 0.80.

    Returns:
        A summary dict with the per-issue results under ``"results"``.
    """
    results = []
    total_time = 0.0

    for issue_id, issue in enumerate(ISSUES, 1):
        expected = issue["expected_priority"]
        prompt = TRIAGE_PROMPT_TEMPLATE.format(
            title=issue["title"], description=issue["description"]
        )
        start = time.time()
        try:
            raw = run_prompt(model, prompt)
            elapsed = time.time() - start
            parsed = extract_json(raw)
            valid_json = parsed is not None
            assigned = None
            if valid_json and isinstance(parsed, dict):
                assigned = normalize_priority(str(parsed.get("priority", "")))

            exact_match = assigned == expected
            off_by_one = (
                assigned is not None
                and not exact_match
                and abs(PRIORITY_LEVELS.get(assigned, -1) - PRIORITY_LEVELS[expected]) == 1
            )
            entry = {
                "issue_id": issue_id,
                "title": issue["title"][:60],
                "expected": expected,
                "assigned": assigned,
                "exact_match": exact_match,
                "off_by_one": off_by_one,
                "valid_json": valid_json,
                "elapsed_s": round(elapsed, 2),
            }
        except Exception as exc:
            # A failed request is scored as a miss so the run can continue.
            elapsed = time.time() - start
            entry = {
                "issue_id": issue_id,
                "title": issue["title"][:60],
                "expected": expected,
                "assigned": None,
                "exact_match": False,
                "off_by_one": False,
                "valid_json": False,
                "elapsed_s": round(elapsed, 2),
                "error": str(exc),
            }
        results.append(entry)
        total_time += elapsed

    exact_count = sum(1 for r in results if r["exact_match"])
    accuracy = exact_count / len(ISSUES)

    return {
        "benchmark": "issue_triage",
        "model": model,
        "total_issues": len(ISSUES),
        "exact_matches": exact_count,
        "accuracy": round(accuracy, 3),
        "passed": accuracy >= 0.80,
        "total_time_s": round(total_time, 2),
        "results": results,
    }
OLLAMA_URL = "http://localhost:11434"

# Models to test, as Ollama model tags.
# Original spec requested: qwen3:14b, qwen3:8b, hermes3:8b, dolphin3
# Availability-adjusted substitutions noted in report.
DEFAULT_MODELS = [
    "hermes3:8b",
    "qwen3.5:latest",
    "qwen2.5:14b",
    "llama3.2:latest",
]

BENCHMARKS_DIR = Path(__file__).parent
DOCS_DIR = Path(__file__).resolve().parent.parent.parent / "docs"

# Benchmark scripts in execution order.
_BENCHMARK_FILES = (
    "01_tool_calling.py",
    "02_code_generation.py",
    "03_shell_commands.py",
    "04_multi_turn_coherence.py",
    "05_issue_triage.py",
)
# Every script exposes run_benchmark(model) except the ones listed here.
_ENTRY_POINTS = {"04_multi_turn_coherence": "run_multi_turn"}
_BENCHMARK_TITLES = {
    "01_tool_calling": "Benchmark 1: Tool Calling Compliance",
    "02_code_generation": "Benchmark 2: Code Generation Correctness",
    "03_shell_commands": "Benchmark 3: Shell Command Generation",
    "04_multi_turn_coherence": "Benchmark 4: Multi-Turn Coherence",
    "05_issue_triage": "Benchmark 5: Issue Triage Quality",
}


def load_benchmark(name: str):
    """Import the benchmark script *name* from ``BENCHMARKS_DIR`` and return the module.

    The scripts' file names start with a digit, so they cannot be imported
    with a normal ``import`` statement and are loaded by path instead.

    Raises:
        ImportError: if no import spec/loader can be created for the path.
    """
    path = BENCHMARKS_DIR / name
    spec = importlib.util.spec_from_file_location(Path(name).stem, path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load benchmark module from {path}")
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod


def model_available(model: str) -> bool:
    """Return True when *model* is listed by Ollama's ``/api/tags`` endpoint.

    A name given without a tag (``"dolphin3"``) also matches its ``:latest``
    entry, because Ollama lists installed models with an explicit tag. Any
    failure to reach or parse the endpoint is reported as "not available".
    """
    try:
        resp = requests.get(f"{OLLAMA_URL}/api/tags", timeout=10)
        if resp.status_code != 200:
            return False
        installed = {entry["name"] for entry in resp.json().get("models", [])}
    except Exception:
        # Deliberately broad: this is a best-effort probe and the caller only
        # needs a yes/no answer to decide whether to skip the model.
        return False

    candidates = {model} if ":" in model else {model, f"{model}:latest"}
    return not installed.isdisjoint(candidates)


def run_all_benchmarks(model: str) -> dict:
    """Run all five benchmarks for *model*.

    Returns:
        A dict mapping the benchmark key (script name without ``.py``) to that
        benchmark's result dict. A benchmark that raises is recorded as a
        failed result carrying an ``"error"`` message.
    """
    results = {}
    for fname in _BENCHMARK_FILES:
        key = fname.replace(".py", "")
        print(f"  [{model}] Running {key}...", flush=True)
        try:
            mod = load_benchmark(fname)
            runner = getattr(mod, _ENTRY_POINTS.get(key, "run_benchmark"))
            start = time.time()
            result = runner(model)
            elapsed = time.time() - start
            print(
                f"    -> {'PASS' if result.get('passed') else 'FAIL'} ({elapsed:.1f}s)",
                flush=True,
            )
            results[key] = result
        except Exception as exc:
            print(f"    -> ERROR: {exc}", flush=True)
            results[key] = {"benchmark": key, "model": model, "passed": False, "error": str(exc)}

    return results


def score_model(results: dict) -> dict:
    """Compute the summary scores shown in the comparison table for one model.

    Non-dict values in *results* (such as the ``"error"``/``"skipped"`` fields
    of a skipped model) are ignored rather than treated as benchmarks.
    """
    benchmarks = [b for b in results.values() if isinstance(b, dict)]
    passed = sum(1 for b in benchmarks if b.get("passed", False))
    total = len(benchmarks)

    def metric(key: str, field: str, default):
        entry = results.get(key, {})
        return entry.get(field, default) if isinstance(entry, dict) else default

    tool_rate = metric("01_tool_calling", "compliance_rate", 0.0)
    code_pass = metric("02_code_generation", "passed", False)
    shell_pass = metric("03_shell_commands", "passed", False)
    coherence = metric("04_multi_turn_coherence", "coherence_rate", 0.0)
    triage_acc = metric("05_issue_triage", "accuracy", 0.0)

    # Multi-prompt benchmarks report total_time_s; single-shot ones elapsed_s.
    total_time = sum(b.get("total_time_s", b.get("elapsed_s", 0.0)) for b in benchmarks)

    return {
        "passed": passed,
        "total": total,
        "pass_rate": f"{passed}/{total}",
        "tool_compliance": f"{tool_rate:.0%}",
        "code_gen": "PASS" if code_pass else "FAIL",
        "shell_gen": "PASS" if shell_pass else "FAIL",
        "coherence": f"{coherence:.0%}",
        "triage_accuracy": f"{triage_acc:.0%}",
        "total_time_s": round(total_time, 1),
    }


def _model_error(results: dict) -> str | None:
    """Return the model-level error message when *results* holds no benchmark data."""
    if "error" in results and "01_tool_calling" not in results:
        return str(results["error"])
    return None


def _benchmark_detail_lines(bkey: str, bres: dict) -> list[str]:
    """Render the markdown detail section for a single benchmark result."""
    title = _BENCHMARK_TITLES.get(bkey, bkey)
    status = "✅ PASS" if bres.get("passed") else "❌ FAIL"
    out = [f"#### {title} — {status}", ""]

    if bkey == "01_tool_calling":
        rate = bres.get("compliance_rate", 0)
        count = bres.get("valid_json_count", 0)
        total = bres.get("total_prompts", 0)
        out.append(f"- **JSON Compliance:** {count}/{total} ({rate:.0%}) — target ≥90%")
    elif bkey == "02_code_generation":
        out.append(f"- **Result:** {bres.get('detail', bres.get('error', 'n/a'))}")
        snippet = bres.get("code_snippet", "")
        if snippet:
            out.append("- **Generated code snippet:**")
            # Indented so the fenced block nests under the list item.
            out.append("  ```python")
            out.extend(f"  {ln}" for ln in snippet.splitlines()[:8])
            out.append("  ```")
    elif bkey == "03_shell_commands":
        passed = bres.get("passed_count", 0)
        refused = bres.get("refused_count", 0)
        total = bres.get("total_prompts", 0)
        out.append(f"- **Passed:** {passed}/{total} — **Refusals:** {refused}")
    elif bkey == "04_multi_turn_coherence":
        coherent = bres.get("coherent_turns", 0)
        total = bres.get("total_turns", 0)
        rate = bres.get("coherence_rate", 0)
        out.append(f"- **Coherent turns:** {coherent}/{total} ({rate:.0%}) — target ≥80%")
    elif bkey == "05_issue_triage":
        exact = bres.get("exact_matches", 0)
        total = bres.get("total_issues", 0)
        acc = bres.get("accuracy", 0)
        out.append(f"- **Accuracy:** {exact}/{total} ({acc:.0%}) — target ≥80%")

    elapsed = bres.get("total_time_s", bres.get("elapsed_s", 0))
    out.append(f"- **Time:** {elapsed}s")
    out.append("")
    return out


def generate_markdown(all_results: dict, run_date: str) -> str:
    """Generate the markdown comparison report.

    Args:
        all_results: Maps model tag to either the per-benchmark results from
            ``run_all_benchmarks`` or, for a model that could not be tested, a
            dict with an ``"error"`` message.
        run_date: Timestamp string shown in the report header.

    Returns:
        The complete report as one newline-joined string.
    """
    lines = [
        "# Model Benchmark Results",
        "",
        # Two trailing spaces force a markdown line break inside the quote.
        f"> Generated: {run_date}  ",
        f"> Ollama URL: `{OLLAMA_URL}`  ",
        "> Issue: [#1066](http://143.198.27.163:3000/rockachopa/Timmy-time-dashboard/issues/1066)",
        "",
        "## Overview",
        "",
        "This report documents the 5-test benchmark suite results for local model candidates.",
        "",
        # NOTE: this table is static text; it is not derived from all_results.
        "### Model Availability vs. Spec",
        "",
        "| Requested | Tested Substitute | Reason |",
        "|-----------|-------------------|--------|",
        "| `qwen3:14b` | `qwen2.5:14b` | `qwen3:14b` not pulled locally |",
        "| `qwen3:8b` | `qwen3.5:latest` | `qwen3:8b` not pulled locally |",
        "| `hermes3:8b` | `hermes3:8b` | Exact match |",
        "| `dolphin3` | `llama3.2:latest` | `dolphin3` not pulled locally |",
        "",
        "## Summary Comparison Table",
        "",
        "| Model | Passed | Tool Calling | Code Gen | Shell Gen | Coherence | Triage Acc | Time (s) |",
        "|-------|--------|-------------|----------|-----------|-----------|------------|----------|",
    ]

    for model, results in all_results.items():
        if _model_error(results) is not None:
            lines.append(f"| `{model}` | — | — | — | — | — | — | — |")
            continue
        s = score_model(results)
        lines.append(
            f"| `{model}` | {s['pass_rate']} | {s['tool_compliance']} | {s['code_gen']} | "
            f"{s['shell_gen']} | {s['coherence']} | {s['triage_accuracy']} | {s['total_time_s']} |"
        )

    lines.append("")
    lines.append("## Per-Model Detail")
    lines.append("")

    for model, results in all_results.items():
        lines.append(f"### `{model}`")
        lines.append("")

        # A skipped model has only an error message; iterating it as if it
        # held benchmark results would call .get() on a string.
        error = _model_error(results)
        if error is not None:
            lines.append(f"> **Error:** {error}")
            lines.append("")
            continue

        for bkey, bres in results.items():
            if isinstance(bres, dict):
                lines.extend(_benchmark_detail_lines(bkey, bres))

    lines.extend(
        [
            "## Raw JSON Data",
            "",
            "<details>",
            "<summary>Click to expand full JSON results</summary>",
            "",
            "```json",
            json.dumps(all_results, indent=2),
            "```",
            "",
            "</details>",
            "",
        ]
    )

    return "\n".join(lines)


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the suite runner."""
    parser = argparse.ArgumentParser(description="Run model benchmark suite")
    parser.add_argument(
        "--models",
        nargs="+",
        default=DEFAULT_MODELS,
        help="Models to test",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=DOCS_DIR / "model-benchmarks.md",
        help="Output markdown file",
    )
    parser.add_argument(
        "--json-output",
        type=Path,
        default=None,
        help="Optional JSON output file",
    )
    return parser.parse_args()


def main() -> int:
    """Run the suite for every requested model and write the report(s).

    Returns:
        0 when every model was tested and passed every benchmark, else 1.
    """
    args = parse_args()
    run_date = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")

    print(f"Model Benchmark Suite — {run_date}")
    print(f"Testing {len(args.models)} model(s): {', '.join(args.models)}")
    print()

    all_results: dict[str, dict] = {}

    for model in args.models:
        print(f"=== Testing model: {model} ===")
        if not model_available(model):
            print(f"  WARNING: {model} not available in Ollama — skipping")
            all_results[model] = {"error": f"Model {model} not available", "skipped": True}
            print()
            continue

        model_results = run_all_benchmarks(model)
        all_results[model] = model_results

        s = score_model(model_results)
        print(f"  Summary: {s['pass_rate']} benchmarks passed in {s['total_time_s']}s")
        print()

    markdown = generate_markdown(all_results, run_date)

    args.output.parent.mkdir(parents=True, exist_ok=True)
    args.output.write_text(markdown, encoding="utf-8")
    print(f"Report written to: {args.output}")

    if args.json_output:
        args.json_output.parent.mkdir(parents=True, exist_ok=True)
        args.json_output.write_text(json.dumps(all_results, indent=2), encoding="utf-8")
        print(f"JSON data written to: {args.json_output}")

    # A skipped model counts as a failure for the exit code.
    all_pass = all(
        not r.get("skipped", False)
        and all(b.get("passed", False) for b in r.values() if isinstance(b, dict))
        for r in all_results.values()
    )
    return 0 if all_pass else 1


if __name__ == "__main__":
    sys.exit(main())