# Tests cosine similarity, TF-IDF backend, auto-detection, and fallback behavior.
"""Tests for the embedding backend module."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import math
|
|
import pytest
|
|
|
|
from nexus.mnemosyne.embeddings import (
|
|
EmbeddingBackend,
|
|
TfidfEmbeddingBackend,
|
|
cosine_similarity,
|
|
get_embedding_backend,
|
|
)
|
|
|
|
|
|
class TestCosineSimilarity:
    """Unit tests for the standalone ``cosine_similarity`` helper."""

    def test_identical_vectors(self):
        # A nonzero vector compared with itself must yield similarity 1.
        vec = [1.0, 2.0, 3.0]
        assert math.isclose(cosine_similarity(vec, vec), 1.0, abs_tol=1e-9)

    def test_orthogonal_vectors(self):
        # Perpendicular vectors have zero cosine similarity.
        x_axis = [1.0, 0.0]
        y_axis = [0.0, 1.0]
        assert math.isclose(cosine_similarity(x_axis, y_axis), 0.0, abs_tol=1e-9)

    def test_opposite_vectors(self):
        # Antiparallel vectors sit at the lower bound of -1.
        forward = [1.0, 0.0]
        backward = [-1.0, 0.0]
        assert math.isclose(cosine_similarity(forward, backward), -1.0, abs_tol=1e-9)

    def test_zero_vector(self):
        # A zero vector has no direction; the helper defines the result as
        # exactly 0.0 rather than dividing by a zero norm.
        zero = [0.0, 0.0]
        other = [1.0, 2.0]
        assert cosine_similarity(zero, other) == 0.0

    def test_dimension_mismatch(self):
        # Mismatched lengths are a caller error and must raise.
        with pytest.raises(ValueError):
            cosine_similarity([1.0], [1.0, 2.0])
|
|
|
|
|
|
class TestTfidfEmbeddingBackend:
    """Unit tests for the pure-Python TF-IDF embedding backend."""

    def test_basic_embed(self):
        # Embedding a non-empty string yields a non-empty all-float vector.
        tfidf = TfidfEmbeddingBackend()
        vector = tfidf.embed("hello world test")
        assert vector
        for component in vector:
            assert isinstance(component, float)

    def test_empty_text(self):
        # No tokens means no vector: the empty string embeds to [].
        tfidf = TfidfEmbeddingBackend()
        assert tfidf.embed("") == []

    def test_identical_texts_similar(self):
        # Embedding the same sentence twice should be (near-)identical.
        tfidf = TfidfEmbeddingBackend()
        text = "the cat sat on the mat"
        first, second = tfidf.embed(text), tfidf.embed(text)
        assert tfidf.similarity(first, second) > 0.99

    def test_different_texts_less_similar(self):
        # Disjoint vocabularies should score well below the midpoint.
        tfidf = TfidfEmbeddingBackend()
        programming = tfidf.embed("python programming language")
        cooking = tfidf.embed("cooking recipes italian food")
        assert tfidf.similarity(programming, cooking) < 0.5

    def test_related_texts_more_similar(self):
        # Topically related texts should outrank an unrelated one,
        # regardless of the absolute similarity values.
        tfidf = TfidfEmbeddingBackend()
        ml = tfidf.embed("machine learning neural networks")
        deep = tfidf.embed("deep learning artificial neural nets")
        baking = tfidf.embed("baking bread sourdough recipe")
        assert tfidf.similarity(ml, deep) > tfidf.similarity(ml, baking)

    def test_name(self):
        # The backend advertises itself as TF-IDF in its display name.
        assert "TF-IDF" in TfidfEmbeddingBackend().name

    def test_dimension_grows(self):
        # The vocabulary (and thus the vector dimension) grows as new
        # tokens are seen.
        tfidf = TfidfEmbeddingBackend()
        before = tfidf.dimension
        tfidf.embed("new unique tokens here")
        assert tfidf.dimension > before

    def test_padding_different_lengths(self):
        # Vectors embedded at different vocabulary sizes have different
        # lengths; similarity() must handle that (by padding) rather than
        # raising, and still return a value in [0, 1].
        tfidf = TfidfEmbeddingBackend()
        short_vec = tfidf.embed("short")
        long_vec = tfidf.embed("this is a much longer text with many more tokens")
        score = tfidf.similarity(short_vec, long_vec)
        assert 0.0 <= score <= 1.0
|
|
|
|
|
|
class TestGetEmbeddingBackend:
    """Tests for the ``get_embedding_backend`` factory / auto-detection."""

    def test_tfidf_preferred(self):
        # Explicitly asking for TF-IDF must return the TF-IDF backend.
        backend = get_embedding_backend(prefer="tfidf")
        assert isinstance(backend, TfidfEmbeddingBackend)

    def test_auto_returns_something(self):
        # With no preference, auto-detection must still yield some backend.
        backend = get_embedding_backend()
        assert isinstance(backend, EmbeddingBackend)

    def test_ollama_unavailable_falls_back(self):
        """Auto-detection falls back to TF-IDF when Ollama is unreachable.

        Port 1 is effectively never listening, so the probe fails fast.
        The original test first called ``get_embedding_backend(
        prefer="ollama", ...)`` — which, per its own comments, raises when
        the server is unavailable — so it could error out before reaching
        the real assertion. That dead call is removed; only the
        auto-detection fallback path is exercised here.
        NOTE(review): if prefer="ollama" is contracted to raise, that
        behavior deserves its own ``pytest.raises`` test — confirm the
        exception type against the factory and add it.
        """
        backend = get_embedding_backend(ollama_url="http://localhost:1")
        assert isinstance(backend, TfidfEmbeddingBackend)
|