"""Tests for the embedding backend module.""" from __future__ import annotations import math import pytest from nexus.mnemosyne.embeddings import ( EmbeddingBackend, TfidfEmbeddingBackend, cosine_similarity, get_embedding_backend, ) class TestCosineSimilarity: def test_identical_vectors(self): a = [1.0, 2.0, 3.0] assert abs(cosine_similarity(a, a) - 1.0) < 1e-9 def test_orthogonal_vectors(self): a = [1.0, 0.0] b = [0.0, 1.0] assert abs(cosine_similarity(a, b) - 0.0) < 1e-9 def test_opposite_vectors(self): a = [1.0, 0.0] b = [-1.0, 0.0] assert abs(cosine_similarity(a, b) - (-1.0)) < 1e-9 def test_zero_vector(self): a = [0.0, 0.0] b = [1.0, 2.0] assert cosine_similarity(a, b) == 0.0 def test_dimension_mismatch(self): with pytest.raises(ValueError): cosine_similarity([1.0], [1.0, 2.0]) class TestTfidfEmbeddingBackend: def test_basic_embed(self): backend = TfidfEmbeddingBackend() vec = backend.embed("hello world test") assert len(vec) > 0 assert all(isinstance(v, float) for v in vec) def test_empty_text(self): backend = TfidfEmbeddingBackend() vec = backend.embed("") assert vec == [] def test_identical_texts_similar(self): backend = TfidfEmbeddingBackend() v1 = backend.embed("the cat sat on the mat") v2 = backend.embed("the cat sat on the mat") sim = backend.similarity(v1, v2) assert sim > 0.99 def test_different_texts_less_similar(self): backend = TfidfEmbeddingBackend() v1 = backend.embed("python programming language") v2 = backend.embed("cooking recipes italian food") sim = backend.similarity(v1, v2) assert sim < 0.5 def test_related_texts_more_similar(self): backend = TfidfEmbeddingBackend() v1 = backend.embed("machine learning neural networks") v2 = backend.embed("deep learning artificial neural nets") v3 = backend.embed("baking bread sourdough recipe") sim_related = backend.similarity(v1, v2) sim_unrelated = backend.similarity(v1, v3) assert sim_related > sim_unrelated def test_name(self): backend = TfidfEmbeddingBackend() assert "TF-IDF" in backend.name def test_dimension_grows(self): backend = TfidfEmbeddingBackend() d1 = backend.dimension backend.embed("new unique tokens here") d2 = backend.dimension assert d2 > d1 def test_padding_different_lengths(self): backend = TfidfEmbeddingBackend() v1 = backend.embed("short") v2 = backend.embed("this is a much longer text with many more tokens") # Should not raise despite different lengths sim = backend.similarity(v1, v2) assert 0.0 <= sim <= 1.0 class TestGetEmbeddingBackend: def test_tfidf_preferred(self): backend = get_embedding_backend(prefer="tfidf") assert isinstance(backend, TfidfEmbeddingBackend) def test_auto_returns_something(self): backend = get_embedding_backend() assert isinstance(backend, EmbeddingBackend) def test_ollama_unavailable_falls_back(self): # Should fall back to TF-IDF when Ollama is unreachable backend = get_embedding_backend(prefer="ollama", ollama_url="http://localhost:1") # If it raises, the test fails — it should fall back # But with prefer="ollama" it raises if unavailable # So we test without prefer: backend = get_embedding_backend(ollama_url="http://localhost:1") assert isinstance(backend, TfidfEmbeddingBackend)