# timmy-config/training/eval-tasks.yaml

# Eval Config — lm-evaluation-harness
# Replaces: autolora/eval/run_eval.py (300 lines)
#
# Usage:
#   lm_eval --model local-completions \
#           --model_args model=timmy:v0.1-q4,base_url=http://localhost:11434/v1 \
#           --tasks hellaswag,truthfulqa_mc2,arc_challenge,winogrande \
#           --output_path training/evals_archive/
#
# For custom Timmy-specific evals, use the vibes check (see Makefile).
# The vibes check is manual by design — you read the output and judge.

# Standard benchmark tasks, run against every model version.
benchmarks:
  # Common-sense reasoning
  - hellaswag
  # Honesty / factuality
  - truthfulqa_mc2
  # Science reasoning
  - arc_challenge
  # Coreference resolution
  - winogrande

# The two models being compared against each other.
# Values are quoted because they contain ':' (name:tag form); the parsed
# strings are unchanged.
models:
  baseline: 'hermes3:latest'
  candidate: 'timmy:v0.1-q4'

# Ollama endpoint URL (quoted so the ':' characters are never reinterpreted).
endpoint: 'http://localhost:11434/v1'