All checks were successful
Smoke Test / smoke (pull_request) Successful in 25s
- Add benchmarks/run_bonsai_compare.py (script + quality scores) - Add benchmarks/bonsai-1bit-comparison-2025-10-06.md (published M4 Pro refs) - Add bonsai_results_seed.json with published numbers - README link to new benchmark page Closes #100
89 lines
2.7 KiB
JSON
89 lines
2.7 KiB
JSON
{
|
|
"generated_at": "2025-10-06T00:00:00.000Z",
|
|
"host_platform": "darwin",
|
|
"notes": "Pre-seeded results file — numbers sourced from Prism ML model READMEs (published M4 Pro Metal measurements). Replace with locally-generated file by running benchmarks/run_bonsai_compare.py.",
|
|
"source": "https://huggingface.co/prism-ml/Bonsai-8B-gguf (and -4B, -1.7B repos)",
|
|
"methodology": "llama-cli --timings, prompt='Once upon a time', 128 tokens, temp=0, -ngl 99 (full GPU offload)",
|
|
"results": [
|
|
{
|
|
"model": "Bonsai-8B-1bit",
|
|
"file": "Bonsai-8B-Q1_0.gguf",
|
|
"found": false,
|
|
"disk_size_gb": 1.15,
|
|
"est_gpu_gb": 1.15,
|
|
"tok_per_sec": null,
|
|
"avg": 70.5,
|
|
"gsm8k": 88.0,
|
|
"mmlu_r": 65.7,
|
|
"musr": 50.0,
|
|
"he_plus": 73.8,
|
|
"ifeval": 79.8,
|
|
"bfcl": 65.7,
|
|
"quality_note": "Published Prism ML technical report (EvalScope v1.4.2). M4 Pro Metal: 85 tok/s.",
|
|
"platform_reference": "M4 Pro (Metal), 48 GB — NOT M1 (see live-run file for actual M1 measurements)"
|
|
},
|
|
{
|
|
"model": "Bonsai-4B-1bit",
|
|
"file": "Bonsai-4B-Q1_0.gguf",
|
|
"found": false,
|
|
"disk_size_gb": 0.57,
|
|
"est_gpu_gb": 0.57,
|
|
"tok_per_sec": null,
|
|
"avg": 67.5,
|
|
"gsm8k": 84.0,
|
|
"mmlu_r": 62.0,
|
|
"quality_note": "Estimated from Bonsai size-quality trend — full eval needed.",
|
|
"platform_reference": "M4 Pro (Metal) published: 136 tok/s"
|
|
},
|
|
{
|
|
"model": "Bonsai-1.7B-1bit",
|
|
"file": "Bonsai-1.7B-Q1_0.gguf",
|
|
"found": false,
|
|
"disk_size_gb": 0.24,
|
|
"est_gpu_gb": 0.24,
|
|
"tok_per_sec": null,
|
|
"avg": 62.0,
|
|
"gsm8k": 78.0,
|
|
"mmlu_r": 56.0,
|
|
"quality_note": "Estimated from Bonsai size-quality trend — full eval needed.",
|
|
"platform_reference": "M4 Pro (Metal) published: 250 tok/s"
|
|
},
|
|
{
|
|
"model": "Qwen3-8B-Q4_0",
|
|
"file": "Qwen3-8B-Q4_0.gguf",
|
|
"found": false,
|
|
"disk_size_gb": 4.70,
|
|
"est_gpu_gb": 4.70,
|
|
"tok_per_sec": null,
|
|
"avg": 79.3,
|
|
"gsm8k": 93.0,
|
|
"mmlu_r": 83.0,
|
|
"source": "Alibaba Qwen 3 8B model card (Q4_0 baseline)"
|
|
},
|
|
{
|
|
"model": "Qwen3-4B-Q4_0",
|
|
"file": "Qwen3-4B-Q4_0.gguf",
|
|
"found": false,
|
|
"disk_size_gb": 2.40,
|
|
"est_gpu_gb": 2.40,
|
|
"tok_per_sec": null,
|
|
"avg": 76.0,
|
|
"gsm8k": 90.0,
|
|
"mmlu_r": 80.0,
|
|
"source": "Approximated from Qwen3 4B model card metrics"
|
|
},
|
|
{
|
|
"model": "Qwen3-1.7B-Q4_0",
|
|
"file": "Qwen3-1.7B-Q4_0.gguf",
|
|
"found": false,
|
|
"disk_size_gb": 1.00,
|
|
"est_gpu_gb": 1.00,
|
|
"tok_per_sec": null,
|
|
"avg": 71.0,
|
|
"gsm8k": 87.0,
|
|
"mmlu_r": 74.0,
|
|
"source": "Approximated from Qwen3 1.7B model card metrics"
|
|
}
|
|
]
|
|
}
|