docs: Document Ollama perplexity limitation — no logprob support (closes #63)

Ollama lacks a token logprob API, so true perplexity cannot be measured via the Ollama backend. Added a warning to the run_benchmarks.py docstring directing users to run_perplexity.py (which uses the llama-perplexity binary) for real PPL measurement with --logprobs support.
2026-04-14 23:23:38 -04:00
4 changed files with 9 additions and 144 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +0,0 @@
-build/
-*.pyc
-__pycache__/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,36 +0,0 @@
-cmake_minimum_required(VERSION 3.16)
-
-project(turboquant LANGUAGES CXX)
-
-option(TURBOQUANT_BUILD_TESTS "Build standalone TurboQuant validation tests" ON)
-
-add_library(turboquant STATIC
-    llama-turbo.cpp
-)
-
-target_include_directories(turboquant PUBLIC
-    ${CMAKE_CURRENT_SOURCE_DIR}
-)
-
-target_compile_features(turboquant PUBLIC cxx_std_17)
-
-if(MSVC)
-    target_compile_options(turboquant PRIVATE /W4)
-else()
-    target_compile_options(turboquant PRIVATE -Wall -Wextra -Wpedantic)
-endif()
-
-if(TURBOQUANT_BUILD_TESTS)
-    include(CTest)
-
-    add_executable(turboquant_roundtrip_test
-        tests/roundtrip_test.cpp
-    )
-    target_link_libraries(turboquant_roundtrip_test PRIVATE turboquant)
-    target_compile_features(turboquant_roundtrip_test PRIVATE cxx_std_17)
-
-    add_test(
-        NAME turboquant_roundtrip
-        COMMAND turboquant_roundtrip_test
-    )
-endif()
--- a/benchmarks/run_benchmarks.py
+++ b/benchmarks/run_benchmarks.py
@@ -5,8 +5,16 @@ TurboQuant Benchmarking Suite — Multi-Backend (Issue #29)
 Supports Ollama and llama-server backends with KV cache type configuration.
 Measures: TTFT, tokens/sec, latency, peak memory.

+IMPORTANT — Perplexity Limitation (Issue #63):
+  Ollama does NOT expose token logprobs. This means:
+  - True perplexity (PPL) cannot be measured via the Ollama backend
+  - The metrics here (tok/s, latency) are throughput proxies, not quality gates
+  - For real perplexity measurement, use benchmarks/run_perplexity.py
+    which calls llama-perplexity directly (--logprobs support)
+  - The pass criterion "PPL delta <= 0.5" cannot be validated via Ollama
+
 Usage:
-    # Ollama (default)
+    # Ollama (default) — throughput benchmarks only, NOT perplexity
    python3 benchmarks/run_benchmarks.py --backend ollama --model llama3

    # llama-server with turbo4 KV
--- a/tests/roundtrip_test.cpp
+++ b/tests/roundtrip_test.cpp
@@ -1,104 +0,0 @@
-#include "llama-turbo.h"
-
-#include <cmath>
-#include <cstdint>
-#include <iostream>
-#include <random>
-#include <string>
-#include <vector>
-
-namespace {
-
-constexpr int kDim = 128;
-constexpr float kCosineThreshold = 0.99f;
-constexpr float kZeroTolerance = 1.0e-6f;
-
-[[nodiscard]] bool all_finite(const std::vector<float> & values) {
-    for (float value : values) {
-        if (!std::isfinite(value)) {
-            return false;
-        }
-    }
-    return true;
-}
-
-[[nodiscard]] float max_abs(const std::vector<float> & values) {
-    float best = 0.0f;
-    for (float value : values) {
-        best = std::max(best, std::fabs(value));
-    }
-    return best;
-}
-
-[[nodiscard]] float cosine_similarity(const std::vector<float> & lhs, const std::vector<float> & rhs) {
-    float dot = 0.0f;
-    float lhs_norm = 0.0f;
-    float rhs_norm = 0.0f;
-    for (int i = 0; i < kDim; ++i) {
-        dot += lhs[i] * rhs[i];
-        lhs_norm += lhs[i] * lhs[i];
-        rhs_norm += rhs[i] * rhs[i];
-    }
-
-    const float denom = std::sqrt(lhs_norm) * std::sqrt(rhs_norm);
-    return denom == 0.0f ? 1.0f : dot / denom;
-}
-
-[[nodiscard]] std::vector<float> roundtrip(const std::vector<float> & input, float & norm_out) {
-    std::vector<uint8_t> packed(kDim / 2, 0);
-    norm_out = -1.0f;
-    polar_quant_encode_turbo4(input.data(), packed.data(), &norm_out, kDim);
-
-    std::vector<float> decoded(kDim, 0.0f);
-    polar_quant_decode_turbo4(packed.data(), decoded.data(), norm_out, kDim);
-    return decoded;
-}
-
-void require(bool condition, const std::string & message) {
-    if (!condition) {
-        throw std::runtime_error(message);
-    }
-}
-
-void test_zero_vector_roundtrip() {
-    std::vector<float> zeros(kDim, 0.0f);
-    float norm = -1.0f;
-    const auto decoded = roundtrip(zeros, norm);
-
-    require(norm == 0.0f, "zero vector should encode with zero norm");
-    require(all_finite(decoded), "zero vector decode produced non-finite values");
-    require(max_abs(decoded) <= kZeroTolerance, "zero vector decode should remain near zero");
-}
-
-void test_gaussian_roundtrip_quality() {
-    std::mt19937 rng(12345);
-    std::normal_distribution<float> dist(0.0f, 1.0f);
-
-    std::vector<float> input(kDim, 0.0f);
-    for (float & value : input) {
-        value = dist(rng);
-    }
-
-    float norm = -1.0f;
-    const auto decoded = roundtrip(input, norm);
-
-    require(norm > 0.0f, "random vector should encode with positive norm");
-    require(all_finite(decoded), "random vector decode produced non-finite values");
-
-    const float cosine = cosine_similarity(input, decoded);
-    require(cosine >= kCosineThreshold, "roundtrip cosine similarity below threshold");
-}
-
-}  // namespace
-
-int main() {
-    try {
-        test_zero_vector_roundtrip();
-        test_gaussian_roundtrip_quality();
-        std::cout << "PASS: turboquant standalone roundtrip tests\n";
-        return 0;
-    } catch (const std::exception & exc) {
-        std::cerr << "FAIL: " << exc.what() << '\n';
-        return 1;
-    }
-}