Compare commits

...

1 Commit

Author SHA1 Message Date
Timmy
064ab602dd feat: M1 benchmark suite — throughput, memory, perplexity comparison
All checks were successful: Smoke Test / smoke (pull_request) successful in 16s
Implements Issue #80: benchmark turboquant vs llama.cpp baseline on M1.

New files:
- benchmarks/run_m1_benchmark.py — comprehensive benchmark runner
- benchmarks/run_benchmark_m1.sh — shell wrapper for easy execution
- tests/test_m1_benchmark.py — unit tests for benchmark functions

Measures:
- Tokens/sec throughput (f16 vs turbo4, 3-run average)
- Memory usage (RSS monitoring during inference)
- Quality via perplexity (llama-perplexity on wikitext-2)

Generates:
- benchmarks/m1_benchmark_results.json — raw results
- benchmarks/m1_benchmark_report.md — markdown comparison table

Closes #80
2026-04-15 22:10:43 -04:00
3 changed files with 908 additions and 0 deletions

benchmarks/run_benchmark_m1.sh

@@ -0,0 +1,91 @@
#!/bin/bash
# TurboQuant M1 Benchmark Runner (Issue #80)
# Runs both f16 and turbo4 KV configs against the same model, collecting throughput + memory + perplexity.
#
# Prerequisites:
# - llama-server built from llama-cpp-turboquant fork (feature/turboquant-kv-cache)
# - Model GGUF file downloaded
# - wikitext-2 corpus in corpora/wiki.test.raw
#
# Usage:
# ./benchmarks/run_benchmark_m1.sh <model_name> <model_path> [llama_server_url]
#
# Example:
# ./benchmarks/run_benchmark_m1.sh qwen3.5:27b ~/models/qwen3.5-27b-q4_k_m.gguf
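# Abort on any error, unset variable, or failed pipeline stage.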
set -euo pipefail
MODEL_NAME="${1:?Usage: $0 <model_name> <model_path> [llama_server_url]}"
MODEL_PATH="${2:?Model path required}"
LLAMA_SERVER="${3:-http://localhost:8080}"
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
LLAMA_BIN="${PROJECT_DIR}/llama.cpp-fork/build/bin"
CORPUS="${PROJECT_DIR}/corpora/wiki.test.raw"
OUTPUT_DIR="${PROJECT_DIR}/benchmarks"
echo "=========================================="
echo "TurboQuant M1 Benchmark"
echo "=========================================="
echo "Model: ${MODEL_NAME}"
echo "Model path: ${MODEL_PATH}"
echo "Server: ${LLAMA_SERVER}"
echo "llama bin: ${LLAMA_BIN}"
echo "Corpus: ${CORPUS}"
echo ""
# Check prerequisites
if [ ! -f "${MODEL_PATH}" ]; then
echo "WARNING: Model file not found: ${MODEL_PATH}"
echo " Perplexity tests will be skipped."
fi
if [ ! -f "${LLAMA_BIN}/llama-perplexity" ]; then
echo "WARNING: llama-perplexity not found at ${LLAMA_BIN}/llama-perplexity"
echo " Perplexity tests will be skipped."
fi
if [ ! -f "${CORPUS}" ]; then
echo "WARNING: Corpus not found: ${CORPUS}"
echo " Download with: curl -L https://raw.githubusercontent.com/pytorch/examples/main/word_language_model/data/wikitext-2/wiki.test.raw -o ${CORPUS}"
fi
# Check server is running
echo "Checking llama-server at ${LLAMA_SERVER}..."
if curl -sf "${LLAMA_SERVER}/health" > /dev/null 2>&1; then
echo " Server is running ✓"
else
echo " Server not responding. Trying /v1/models..."
if curl -sf "${LLAMA_SERVER}/v1/models" > /dev/null 2>&1; then
echo " Server is running (no /health endpoint) ✓"
else
echo " ERROR: llama-server not reachable at ${LLAMA_SERVER}"
echo " Start with: llama-server -m ${MODEL_PATH} --port 8080 -ctk turbo4 -ctv turbo4 -c 4096"
exit 1
fi
fi
# Run benchmark
echo ""
echo "Starting benchmark suite..."
python3 "${SCRIPT_DIR}/run_m1_benchmark.py" \
--model "${MODEL_NAME}" \
--model-path "${MODEL_PATH}" \
--backend llama-server \
--llama-server "${LLAMA_SERVER}" \
--llama-bin "${LLAMA_BIN}" \
--corpus "${CORPUS}" \
--context 2048 \
--threads 4 \
--num-predict 256 \
--runs 3 \
--output-dir "${OUTPUT_DIR}" \
--ppl-threshold 0.5
echo ""
echo "=========================================="
echo "Done. Results in:"
echo " ${OUTPUT_DIR}/m1_benchmark_results.json"
echo " ${OUTPUT_DIR}/m1_benchmark_report.md"
echo "=========================================="

benchmarks/run_m1_benchmark.py

@@ -0,0 +1,681 @@
#!/usr/bin/env python3
"""
TurboQuant M1 Benchmark Suite (Issue #80)
Comprehensive benchmark comparing TurboQuant (turbo4 KV) vs baseline (f16 KV)
on Apple M1 Mac. Measures: tokens/sec, memory usage, quality (perplexity).
Usage:
python3 benchmarks/run_m1_benchmark.py \
--model qwen3.5:27b \
--llama-server http://localhost:8080 \
--llama-bin ~/llama-cpp-turboquant/build/bin
# Skip perplexity (quick throughput/memory only)
python3 benchmarks/run_m1_benchmark.py --model qwen3.5:27b --skip-perplexity
Outputs:
- benchmarks/m1_benchmark_results.json
- benchmarks/m1_benchmark_report.md
"""
import argparse
import json
import os
import re
import subprocess
import sys
import time
import threading
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple
try:
import requests
except ImportError:
print("ERROR: requests package required. Install with: pip install requests")
sys.exit(1)
# ── Memory Monitoring ───────────────────────────────────────────────────────
class MemoryMonitor:
"""Monitor memory usage of a process in background."""
def __init__(self, pid: int, interval: float = 0.5):
self.pid = pid
self.interval = interval
self.samples = []
self._stop = threading.Event()
self._thread = None
def start(self):
self._stop.clear()
self._thread = threading.Thread(target=self._monitor_loop, daemon=True)
self._thread.start()
def stop(self):
self._stop.set()
if self._thread:
self._thread.join(timeout=2)
return self.get_stats()
def _monitor_loop(self):
while not self._stop.is_set():
try:
mem_mb = self._get_memory_mb()
if mem_mb > 0:
self.samples.append(mem_mb)
except Exception:
pass
time.sleep(self.interval)
def _get_memory_mb(self) -> float:
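        # Resident set size in MiB. macOS `ps -o rss=` reports KiB and
        # Linux /proc/<pid>/status reports VmRSS in kB, hence the /1024 below.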
if sys.platform == "darwin":
result = subprocess.run(
["ps", "-o", "rss=", "-p", str(self.pid)],
capture_output=True, text=True
)
if result.returncode == 0 and result.stdout.strip():
return int(result.stdout.strip()) / 1024
else:
try:
with open(f"/proc/{self.pid}/status") as f:
for line in f:
if line.startswith("VmRSS:"):
return int(line.split()[1]) / 1024
except FileNotFoundError:
pass
return 0.0
def get_stats(self) -> dict:
if not self.samples:
return {"avg_mb": 0, "peak_mb": 0, "min_mb": 0, "samples": 0}
return {
"avg_mb": round(sum(self.samples) / len(self.samples), 1),
"peak_mb": round(max(self.samples), 1),
"min_mb": round(min(self.samples), 1),
"samples": len(self.samples),
}
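# Illustrative use of MemoryMonitor (a sketch; main() below wires this up for real):
#   pid = find_llama_server_pid()
#   if pid:
#       monitor = MemoryMonitor(pid, interval=0.5)
#       monitor.start()
#       # ... issue inference requests ...
#       stats = monitor.stop()  # returns {"avg_mb": ..., "peak_mb": ..., "min_mb": ..., "samples": ...}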
# ── System Info ─────────────────────────────────────────────────────────────
def get_system_info() -> dict:
info = {"platform": sys.platform, "python": sys.version.split()[0]}
try:
if sys.platform == "darwin":
info["chip"] = subprocess.run(
["sysctl", "-n", "machdep.cpu.brand_string"],
capture_output=True, text=True
).stdout.strip()
mem_bytes = int(subprocess.run(
["sysctl", "-n", "hw.memsize"],
capture_output=True, text=True
).stdout.strip())
info["memory_gb"] = round(mem_bytes / (1024**3), 1)
info["cpu_cores"] = os.cpu_count()
else:
info["cpu"] = subprocess.run(
["uname", "-m"], capture_output=True, text=True
).stdout.strip()
info["cpu_cores"] = os.cpu_count()
except Exception:
info["error"] = "Could not detect hardware"
return info
# ── Benchmark Functions ─────────────────────────────────────────────────────
def find_llama_server_pid() -> Optional[int]:
"""Find PID of running llama-server process."""
try:
result = subprocess.run(
["pgrep", "-f", "llama-server"],
capture_output=True, text=True
)
if result.stdout.strip():
return int(result.stdout.strip().split("\n")[0])
except Exception:
pass
return None
def run_throughput_test(prompt: str, model: str, url: str, kv_type: str,
num_predict: int = 256, timeout: int = 120) -> dict:
"""Run a single throughput test against llama-server."""
api_url = f"{url.rstrip('/')}/v1/chat/completions"
start = time.time()
ttft = None
tokens_per_sec = 0.0
try:
resp = requests.post(api_url, json={
"model": model,
"messages": [{"role": "user", "content": prompt}],
"max_tokens": num_predict,
"stream": False
}, timeout=timeout)
elapsed = time.time() - start
resp.raise_for_status()
data = resp.json()
response_text = data.get("choices", [{}])[0].get("message", {}).get("content", "")
usage = data.get("usage", {})
completion_tokens = usage.get("completion_tokens", 0)
prompt_tokens = usage.get("prompt_tokens", 0)
if elapsed > 0 and completion_tokens > 0:
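            # Rough decode rate: subtract ~0.1 s as an assumed allowance for HTTP and
            # prompt-processing overhead; this is a heuristic, not a measured eval time.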
tokens_per_sec = completion_tokens / max(elapsed - 0.1, 0.01)
return {
"response_len": len(response_text),
"latency_s": round(elapsed, 3),
"tokens_per_sec": round(tokens_per_sec, 2),
"completion_tokens": completion_tokens,
"prompt_tokens": prompt_tokens,
"kv_type": kv_type,
"status": "success"
}
except Exception as e:
return {"status": "failed", "error": str(e), "latency_s": round(time.time() - start, 3)}
def run_ollama_test(prompt: str, model: str, url: str,
num_predict: int = 256, timeout: int = 120) -> dict:
"""Run a single throughput test against Ollama."""
api_url = f"{url.rstrip('/')}/api/generate"
start = time.time()
try:
resp = requests.post(api_url, json={
"model": model,
"prompt": prompt,
"stream": False,
"options": {"num_predict": num_predict}
}, timeout=timeout)
elapsed = time.time() - start
resp.raise_for_status()
data = resp.json()
response_text = data.get("response", "")
eval_count = data.get("eval_count", 0)
eval_duration_ns = data.get("eval_duration", 0)
prompt_eval_ns = data.get("prompt_eval_duration", 0)
tokens_per_sec = 0.0
if eval_duration_ns > 0:
tokens_per_sec = eval_count / (eval_duration_ns / 1e9)
ttft = None
if prompt_eval_ns > 0:
ttft = prompt_eval_ns / 1e9
return {
"response_len": len(response_text),
"latency_s": round(elapsed, 3),
"ttft_s": round(ttft, 3) if ttft else None,
"tokens_per_sec": round(tokens_per_sec, 2),
"completion_tokens": eval_count,
"prompt_tokens": data.get("prompt_eval_count", 0),
"status": "success"
}
except Exception as e:
return {"status": "failed", "error": str(e), "latency_s": round(time.time() - start, 3)}
def run_perplexity_test(llama_bin: str, model_path: str, corpus: str,
context: int, kv_type: str, threads: int = 4) -> dict:
"""Run llama-perplexity and parse output."""
if not os.path.exists(llama_bin):
return {"error": f"Binary not found: {llama_bin}", "passed": False}
if not os.path.exists(model_path):
return {"error": f"Model not found: {model_path}", "passed": False}
if not os.path.exists(corpus):
return {"error": f"Corpus not found: {corpus}", "passed": False}
cmd = [
llama_bin,
"-m", model_path,
"-f", corpus,
"-c", str(context),
"-t", str(threads),
"--kv-type", kv_type,
]
print(f" Command: {' '.join(cmd)}")
start = time.time()
try:
result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600)
elapsed = time.time() - start
output = result.stdout + "\n" + result.stderr
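        # Loose parse of the reported perplexity. The "perplexity: <value>" form is an
        # assumption about the fork's output; mainline llama-perplexity prints
        # "Final estimate: PPL = ...", so adjust the pattern if it never matches.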
ppl_match = re.search(r"perplexity[:\s]+(\d+\.?\d*)", output, re.IGNORECASE)
ppl = float(ppl_match.group(1)) if ppl_match else None
token_match = re.search(r"(\d+) tokens", output)
tokens = int(token_match.group(1)) if token_match else None
return {
"kv_type": kv_type,
"perplexity": ppl,
"tokens": tokens,
"elapsed_seconds": round(elapsed, 1),
"exit_code": result.returncode,
"passed": result.returncode == 0 and ppl is not None,
"output_tail": output.strip()[-500:] if output else "",
}
except subprocess.TimeoutExpired:
return {"kv_type": kv_type, "perplexity": None, "error": "Timeout",
"passed": False, "elapsed_seconds": 3600}
# ── Prompt Sets ─────────────────────────────────────────────────────────────
THROUGHPUT_PROMPTS = [
"Explain the difference between TCP and UDP protocols. Include use cases for each.",
"Write a Python function that implements binary search on a sorted list.",
"What are the three laws of thermodynamics? Explain each in simple terms.",
"Describe the process of photosynthesis step by step.",
"Write a recursive function to calculate the Fibonacci sequence with memoization.",
]
# ── Report Generation ───────────────────────────────────────────────────────
def generate_report(results: dict, output_path: str):
"""Generate markdown report from benchmark results."""
lines = []
lines.append("# TurboQuant M1 Benchmark Report")
lines.append("")
lines.append(f"**Date:** {results['timestamp']}")
lines.append(f"**Hardware:** {results['system'].get('chip', 'unknown')}, "
f"{results['system'].get('memory_gb', '?')}GB RAM, "
f"{results['system'].get('cpu_cores', '?')} cores")
lines.append(f"**Model:** {results['model']}")
lines.append("")
# Throughput comparison
lines.append("## Throughput Comparison")
lines.append("")
tp = results.get("throughput", {})
baseline = tp.get("f16", {})
turbo = tp.get("turbo4", {})
lines.append("| Metric | f16 (baseline) | turbo4 (TurboQuant) | Delta |")
lines.append("|:-------|:---------------|:--------------------|:------|")
def fmt_delta(baseline_val, turbo_val, suffix="", higher_is_better=True):
if baseline_val and turbo_val:
delta = turbo_val - baseline_val
pct = (delta / baseline_val) * 100 if baseline_val else 0
sign = "+" if delta >= 0 else ""
better = (delta >= 0) if higher_is_better else (delta <= 0)
marker = "" if better else ""
return (f"{baseline_val}{suffix}", f"{turbo_val}{suffix}",
f"{sign}{pct:.1f}% {marker}")
return ("N/A", "N/A", "N/A")
b_tok, t_tok, d_tok = fmt_delta(
baseline.get("avg_tok_per_sec"), turbo.get("avg_tok_per_sec"), " tok/s")
b_lat, t_lat, d_lat = fmt_delta(
baseline.get("avg_latency"), turbo.get("avg_latency"), "s", higher_is_better=False)
b_ttft, t_ttft, d_ttft = fmt_delta(
baseline.get("avg_ttft"), turbo.get("avg_ttft"), "s", higher_is_better=False)
lines.append(f"| Tokens/sec (avg) | {b_tok} | {t_tok} | {d_tok} |")
lines.append(f"| Latency (avg) | {b_lat} | {t_lat} | {d_lat} |")
lines.append(f"| TTFT (avg) | {b_ttft} | {t_ttft} | {d_ttft} |")
lines.append("")
# Per-prompt breakdown
lines.append("### Per-Prompt Results")
lines.append("")
lines.append("| Prompt # | f16 tok/s | turbo4 tok/s | Status |")
lines.append("|:---------|:----------|:-------------|:-------|")
baseline_results = baseline.get("results", [])
turbo_results = turbo.get("results", [])
for i, (b, t) in enumerate(zip(baseline_results, turbo_results), 1):
b_tps = b.get("tokens_per_sec", 0)
t_tps = t.get("tokens_per_sec", 0)
if b.get("status") == "success" and t.get("status") == "success":
delta_pct = ((t_tps - b_tps) / b_tps * 100) if b_tps else 0
status = "" if delta_pct > -20 else ""
lines.append(f"| {i} | {b_tps:.1f} | {t_tps:.1f} | {status} ({delta_pct:+.1f}%) |")
else:
err_b = b.get("error", b.get("status", "?"))
err_t = t.get("error", t.get("status", "?"))
lines.append(f"| {i} | {err_b} | {err_t} | ✗ |")
lines.append("")
# Memory comparison
lines.append("## Memory Usage")
lines.append("")
mem = results.get("memory", {})
b_mem = mem.get("f16", {})
t_mem = mem.get("turbo4", {})
lines.append("| Metric | f16 (baseline) | turbo4 (TurboQuant) | Savings |")
lines.append("|:-------|:---------------|:--------------------|:--------|")
if b_mem.get("peak_mb") and t_mem.get("peak_mb"):
savings = b_mem["peak_mb"] - t_mem["peak_mb"]
savings_pct = (savings / b_mem["peak_mb"]) * 100
lines.append(f"| Peak RSS | {b_mem['peak_mb']:.0f} MB | {t_mem['peak_mb']:.0f} MB | "
f"{savings:.0f} MB ({savings_pct:.1f}%) |")
if b_mem.get("avg_mb") and t_mem.get("avg_mb"):
lines.append(f"| Avg RSS | {b_mem['avg_mb']:.0f} MB | {t_mem['avg_mb']:.0f} MB | "
f"{b_mem['avg_mb'] - t_mem['avg_mb']:.0f} MB |")
lines.append("")
# Perplexity
ppl = results.get("perplexity", {})
if ppl.get("f16") or ppl.get("turbo4"):
lines.append("## Quality (Perplexity)")
lines.append("")
lines.append("| KV Type | Perplexity | Tokens | Time |")
lines.append("|:--------|:-----------|:-------|:-----|")
for kv in ["f16", "turbo4"]:
r = ppl.get(kv, {})
ppl_val = r.get("perplexity")
tokens = r.get("tokens")
elapsed = r.get("elapsed_seconds")
lines.append(f"| {kv} | {ppl_val:.4f} if ppl_val else 'N/A' | "
f"{tokens or 'N/A'} | {elapsed or 'N/A'}s |")
if ppl.get("delta") is not None:
lines.append("")
lines.append(f"**PPL Delta (turbo4 - f16):** {ppl['delta']:+.4f}")
lines.append(f"**Threshold:** ≤ {ppl.get('threshold', 0.5)}")
lines.append(f"**Result:** {'PASS ✓' if ppl.get('pass') else 'FAIL ✗'}")
lines.append("")
# Summary
lines.append("## Summary")
lines.append("")
# Compute overall verdict
throughput_ok = True
if turbo.get("avg_tok_per_sec") and baseline.get("avg_tok_per_sec"):
ratio = turbo["avg_tok_per_sec"] / baseline["avg_tok_per_sec"]
throughput_ok = ratio >= 0.80 # 80% of baseline is acceptable
lines.append(f"- **Throughput:** {ratio*100:.0f}% of baseline "
f"({'PASS' if throughput_ok else 'BORDERLINE'})")
memory_ok = True
if t_mem.get("peak_mb") and b_mem.get("peak_mb"):
savings_pct = (b_mem["peak_mb"] - t_mem["peak_mb"]) / b_mem["peak_mb"] * 100
memory_ok = savings_pct > 50 # Expect >50% savings
lines.append(f"- **Memory savings:** {savings_pct:.1f}% "
f"({'PASS' if memory_ok else 'CHECK'})")
if ppl.get("pass") is not None:
lines.append(f"- **Quality (PPL):** {'PASS' if ppl['pass'] else 'FAIL'} "
f"(delta={ppl.get('delta', 'N/A')})")
lines.append("")
# Issues discovered
issues = results.get("issues_discovered", [])
if issues:
lines.append("## Issues Discovered")
lines.append("")
for issue in issues:
lines.append(f"- **{issue['title']}**")
lines.append(f" {issue.get('description', '')}")
lines.append("")
lines.append("---")
lines.append(f"*Generated by run_m1_benchmark.py — Issue #80*")
report = "\n".join(lines)
with open(output_path, "w") as f:
f.write(report)
return report
# ── Main ────────────────────────────────────────────────────────────────────
def main():
parser = argparse.ArgumentParser(description="TurboQuant M1 Benchmark Suite")
parser.add_argument("--model", required=True, help="Model name (e.g. qwen3.5:27b)")
parser.add_argument("--model-path", default=None,
help="Path to GGUF model file (for perplexity)")
parser.add_argument("--backend", choices=["llama-server", "ollama"],
default="llama-server")
parser.add_argument("--llama-server", default="http://localhost:8080",
help="llama-server URL")
parser.add_argument("--ollama-url", default="http://localhost:11434",
help="Ollama URL")
parser.add_argument("--llama-bin", default=None,
help="Path to llama.cpp build/bin directory")
parser.add_argument("--corpus", default="corpora/wiki.test.raw",
help="Path to wikitext-2 corpus")
parser.add_argument("--context", type=int, default=2048,
help="Context length for perplexity test")
parser.add_argument("--threads", type=int, default=4,
help="Thread count for perplexity")
parser.add_argument("--num-predict", type=int, default=256,
help="Max tokens to generate per prompt")
parser.add_argument("--runs", type=int, default=3,
help="Number of runs per config for averaging")
parser.add_argument("--skip-perplexity", action="store_true",
help="Skip perplexity measurement")
parser.add_argument("--output-dir", default="benchmarks",
help="Output directory")
parser.add_argument("--ppl-threshold", type=float, default=0.5,
help="Max acceptable PPL delta")
args = parser.parse_args()
os.makedirs(args.output_dir, exist_ok=True)
# System info
print("Gathering system info...")
system_info = get_system_info()
print(f" Platform: {system_info.get('chip', system_info.get('cpu', '?'))}")
print(f" Memory: {system_info.get('memory_gb', '?')}GB")
print(f" Cores: {system_info.get('cpu_cores', '?')}")
# URL
url = args.llama_server if args.backend == "llama-server" else args.ollama_url
# KV types to test
kv_types = ["f16", "turbo4"]
results = {
"timestamp": datetime.now(timezone.utc).isoformat(),
"system": system_info,
"model": args.model,
"backend": args.backend,
"url": url,
"num_predict": args.num_predict,
"runs_per_config": args.runs,
"throughput": {},
"memory": {},
"perplexity": {},
"issues_discovered": [],
}
# ── Throughput + Memory Tests ────────────────────────────────────────
for kv_type in kv_types:
print(f"\n{'='*60}")
print(f"Testing: {kv_type} KV cache")
print(f"{'='*60}")
run_results = []
# Find server PID for memory monitoring
server_pid = find_llama_server_pid()
monitor = None
if server_pid:
print(f" Monitoring PID {server_pid} for memory")
monitor = MemoryMonitor(server_pid)
monitor.start()
for i in range(args.runs):
prompt = THROUGHPUT_PROMPTS[i % len(THROUGHPUT_PROMPTS)]
print(f" Run {i+1}/{args.runs}...", end=" ", flush=True)
if args.backend == "llama-server":
result = run_throughput_test(prompt, args.model, url, kv_type,
num_predict=args.num_predict)
else:
result = run_ollama_test(prompt, args.model, url,
num_predict=args.num_predict)
result["kv_type"] = "default" # Ollama doesn't expose KV type
status = "" if result["status"] == "success" else ""
tps = result.get("tokens_per_sec", 0)
print(f"{status} {tps:.1f} tok/s, {result.get('latency_s', 0):.2f}s")
run_results.append(result)
# Stop memory monitor
mem_stats = {"avg_mb": 0, "peak_mb": 0, "min_mb": 0, "samples": 0}
if monitor:
mem_stats = monitor.stop()
print(f" Memory: peak={mem_stats['peak_mb']:.0f}MB, "
f"avg={mem_stats['avg_mb']:.0f}MB")
results["memory"][kv_type] = mem_stats
# Aggregate throughput
successful = [r for r in run_results if r["status"] == "success"]
if successful:
avg_tps = sum(r.get("tokens_per_sec", 0) for r in successful) / len(successful)
avg_lat = sum(r.get("latency_s", 0) for r in successful) / len(successful)
ttfts = [r.get("ttft_s") for r in successful if r.get("ttft_s")]
avg_ttft = sum(ttfts) / len(ttfts) if ttfts else None
else:
avg_tps = avg_lat = avg_ttft = 0
results["throughput"][kv_type] = {
"avg_tok_per_sec": round(avg_tps, 2),
"avg_latency": round(avg_lat, 3),
"avg_ttft": round(avg_ttft, 3) if avg_ttft else None,
"success_rate": f"{len(successful)}/{len(run_results)}",
"results": run_results,
}
# ── Perplexity Tests ─────────────────────────────────────────────────
if not args.skip_perplexity:
llama_bin = None
if args.llama_bin:
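            # Accept either a build/bin directory or its parent build directory.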
llama_bin = os.path.join(args.llama_bin, "llama-perplexity")
if not os.path.exists(llama_bin):
llama_bin = os.path.join(args.llama_bin, "bin", "llama-perplexity")
model_path = args.model_path
if llama_bin and os.path.exists(llama_bin) and model_path and os.path.exists(model_path) \
and os.path.exists(args.corpus):
print(f"\n{'='*60}")
print("Perplexity Tests")
print(f"{'='*60}")
print(f" Model: {model_path}")
print(f" Corpus: {args.corpus}")
print(f" Context: {args.context}")
ppl_results = {"f16": {}, "turbo4": {}, "threshold": args.ppl_threshold}
for kv_type in kv_types:
print(f"\n Running {kv_type} perplexity...")
ppl_results[kv_type] = run_perplexity_test(
llama_bin, model_path, args.corpus,
args.context, kv_type, args.threads
)
ppl_val = ppl_results[kv_type].get("perplexity")
if ppl_val:
print(f" PPL = {ppl_val:.4f}")
# Calculate delta
b_ppl = ppl_results.get("f16", {}).get("perplexity")
t_ppl = ppl_results.get("turbo4", {}).get("perplexity")
if b_ppl and t_ppl:
delta = t_ppl - b_ppl
ppl_results["delta"] = round(delta, 4)
ppl_results["pass"] = delta <= args.ppl_threshold
print(f"\n Delta: {delta:+.4f} (threshold: ≤{args.ppl_threshold})")
print(f" Result: {'PASS ✓' if ppl_results['pass'] else 'FAIL ✗'}")
results["perplexity"] = ppl_results
else:
print("\nSkipping perplexity: need --llama-bin, --model-path, and corpus file")
if not llama_bin or not os.path.exists(llama_bin):
print(f" llama-perplexity: {llama_bin or 'not specified'}")
if not model_path or not os.path.exists(model_path):
print(f" model path: {model_path or 'not specified (use --model-path)'}")
if not os.path.exists(args.corpus):
print(f" corpus: {args.corpus}")
results["perplexity"] = {"skipped": True, "reason": "missing binaries/model/corpus"}
# ── Issue Detection ──────────────────────────────────────────────────
tp = results["throughput"]
baseline_tps = tp.get("f16", {}).get("avg_tok_per_sec", 0)
turbo_tps = tp.get("turbo4", {}).get("avg_tok_per_sec", 0)
if baseline_tps > 0 and turbo_tps > 0:
ratio = turbo_tps / baseline_tps
if ratio < 0.75:
results["issues_discovered"].append({
"title": "turbo4 throughput below 75% of baseline",
"description": f"turbo4={turbo_tps:.1f} tok/s vs f16={baseline_tps:.1f} tok/s "
f"({ratio*100:.0f}%). Investigate Metal kernel overhead.",
})
mem = results["memory"]
b_peak = mem.get("f16", {}).get("peak_mb", 0)
t_peak = mem.get("turbo4", {}).get("peak_mb", 0)
if b_peak > 0 and t_peak > 0:
savings_pct = (b_peak - t_peak) / b_peak * 100
if savings_pct < 50:
results["issues_discovered"].append({
"title": "turbo4 memory savings below expected 73%",
"description": f"Observed {savings_pct:.1f}% savings (expected ~73%). "
f"Check if turbo4 KV is actually active.",
})
ppl = results.get("perplexity", {})
if ppl.get("delta") and ppl["delta"] > args.ppl_threshold:
results["issues_discovered"].append({
"title": f"PPL regression exceeds threshold ({ppl['delta']:.4f} > {args.ppl_threshold})",
"description": f"Quality degradation detected. Delta={ppl['delta']:.4f}. "
f"Consider asymmetric K/V (q8_0/turbo4) or per-layer adaptive mode.",
})
# ── Save Results ─────────────────────────────────────────────────────
results_path = os.path.join(args.output_dir, "m1_benchmark_results.json")
with open(results_path, "w") as f:
json.dump(results, f, indent=2)
print(f"\nResults saved to {results_path}")
# ── Generate Report ──────────────────────────────────────────────────
report_path = os.path.join(args.output_dir, "m1_benchmark_report.md")
report = generate_report(results, report_path)
print(f"Report saved to {report_path}")
# Print summary
print(f"\n{'='*60}")
print("SUMMARY")
print(f"{'='*60}")
if baseline_tps and turbo_tps:
ratio = turbo_tps / baseline_tps
print(f" Throughput: {turbo_tps:.1f} tok/s ({ratio*100:.0f}% of baseline {baseline_tps:.1f})")
if b_peak and t_peak:
savings = (b_peak - t_peak) / b_peak * 100
print(f" Memory: {t_peak:.0f}MB peak ({savings:.0f}% savings)")
if ppl.get("delta") is not None:
print(f" Quality: PPL delta={ppl['delta']:+.4f} ({'PASS' if ppl['pass'] else 'FAIL'})")
if results["issues_discovered"]:
print(f" Issues: {len(results['issues_discovered'])} found")
print(f"{'='*60}")
if __name__ == "__main__":
main()

tests/test_m1_benchmark.py

@@ -0,0 +1,136 @@
#!/usr/bin/env python3
"""
Tests for run_m1_benchmark.py (Issue #80)
Validates core benchmark functions without requiring a live server.
"""
import json
import os
import sys
import tempfile
import unittest
from unittest.mock import patch, MagicMock
# Add parent dir to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from benchmarks.run_m1_benchmark import (
MemoryMonitor,
get_system_info,
generate_report,
)
class TestMemoryMonitor(unittest.TestCase):
def test_init(self):
mon = MemoryMonitor(pid=1, interval=0.1)
self.assertEqual(mon.pid, 1)
self.assertEqual(mon.samples, [])
def test_get_stats_empty(self):
mon = MemoryMonitor(pid=1)
stats = mon.get_stats()
self.assertEqual(stats["avg_mb"], 0)
self.assertEqual(stats["peak_mb"], 0)
self.assertEqual(stats["samples"], 0)
def test_get_stats_with_samples(self):
mon = MemoryMonitor(pid=1)
mon.samples = [100.0, 150.0, 200.0, 120.0]
stats = mon.get_stats()
self.assertEqual(stats["peak_mb"], 200.0)
self.assertEqual(stats["min_mb"], 100.0)
self.assertEqual(stats["avg_mb"], 142.5)
self.assertEqual(stats["samples"], 4)
class TestSystemInfo(unittest.TestCase):
def test_returns_dict(self):
info = get_system_info()
self.assertIsInstance(info, dict)
self.assertIn("platform", info)
self.assertIn("python", info)
class TestReportGeneration(unittest.TestCase):
def test_basic_report(self):
results = {
"timestamp": "2026-04-15T12:00:00Z",
"system": {"chip": "Apple M1", "memory_gb": 16, "cpu_cores": 8},
"model": "test-model",
"throughput": {
"f16": {
"avg_tok_per_sec": 100.0,
"avg_latency": 2.5,
"avg_ttft": 0.3,
"results": [
{"tokens_per_sec": 100, "latency_s": 2.5, "status": "success"},
],
},
"turbo4": {
"avg_tok_per_sec": 90.0,
"avg_latency": 2.8,
"avg_ttft": 0.35,
"results": [
{"tokens_per_sec": 90, "latency_s": 2.8, "status": "success"},
],
},
},
"memory": {
"f16": {"peak_mb": 1000, "avg_mb": 900},
"turbo4": {"peak_mb": 300, "avg_mb": 250},
},
"perplexity": {
"f16": {"perplexity": 12.5, "tokens": 5000, "elapsed_seconds": 120},
"turbo4": {"perplexity": 12.8, "tokens": 5000, "elapsed_seconds": 130},
"delta": 0.3,
"pass": True,
"threshold": 0.5,
},
"issues_discovered": [],
}
with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f:
report_path = f.name
try:
report = generate_report(results, report_path)
self.assertIn("TurboQuant M1 Benchmark Report", report)
self.assertIn("f16", report)
self.assertIn("turbo4", report)
self.assertIn("PASS", report)
# Verify file was written
with open(report_path) as f:
written = f.read()
self.assertEqual(written, report)
finally:
os.unlink(report_path)
def test_report_with_issues(self):
results = {
"timestamp": "2026-04-15T12:00:00Z",
"system": {"chip": "M1", "memory_gb": 16, "cpu_cores": 8},
"model": "test",
"throughput": {"f16": {"results": []}, "turbo4": {"results": []}},
"memory": {"f16": {}, "turbo4": {}},
"perplexity": {},
"issues_discovered": [
{"title": "Test issue", "description": "Something went wrong"}
],
}
with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f:
report_path = f.name
try:
report = generate_report(results, report_path)
self.assertIn("Issues Discovered", report)
self.assertIn("Test issue", report)
finally:
os.unlink(report_path)
if __name__ == "__main__":
unittest.main()