# Eval Config — lm-evaluation-harness
#
# Replaces: autolora/eval/run_eval.py (300 lines)
#
# Usage:
#   lm_eval --model local-completions \
#     --model_args model=timmy:v0.1-q4,base_url=http://localhost:11434/v1 \
#     --tasks hellaswag,truthfulqa_mc2,arc_challenge \
#     --output_path training/evals_archive/
#
# For custom Timmy-specific evals, use the vibes check (see Makefile).
# The vibes check is manual by design — you read the output and judge.

# Standard benchmarks to run against each model version
benchmarks:
  - hellaswag       # Common sense reasoning
  - truthfulqa_mc2  # Honesty / factuality
  - arc_challenge   # Science reasoning
  - winogrande      # Coreference resolution

# Models to compare
models:
  baseline: hermes3:latest
  candidate: timmy:v0.1-q4

# Ollama endpoint
endpoint: http://localhost:11434/v1