From cc4ead999adbde8fa064ffa2f53b715a4c8e8e72 Mon Sep 17 00:00:00 2001
From: teyrebaz33
Date: Sun, 8 Mar 2026 19:57:51 +0300
Subject: [PATCH] =?UTF-8?q?feat:=20configurable=20embedding=20infrastructu?=
 =?UTF-8?q?re=20=E2=80=94=20local=20(fastembed)=20+=20API=20(OpenAI)=20(#6?=
 =?UTF-8?q?75)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add agent/embeddings.py with Embedder protocol, FastEmbedEmbedder, OpenAIEmbedder
- Factory function get_embedder() reads provider from config.yaml embeddings section
- Lazy initialization — no startup impact, model loaded on first embed call
- cosine_similarity() and cosine_similarity_matrix() utility functions included
- Add fastembed as optional dependency in pyproject.toml
- 30 unit tests, all passing

Closes #675
---
 agent/embeddings.py      | 219 +++++++++++++++++++++++++++++++++++++++
 pyproject.toml           |   2 +
 tests/test_embeddings.py | 212 +++++++++++++++++++++++++++++++++++++
 3 files changed, 433 insertions(+)
 create mode 100644 agent/embeddings.py
 create mode 100644 tests/test_embeddings.py

diff --git a/agent/embeddings.py b/agent/embeddings.py
new file mode 100644
index 000000000..8e0f0696a
--- /dev/null
+++ b/agent/embeddings.py
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+"""
+Embedding Infrastructure — Configurable local (fastembed) + API (OpenAI) embedders.
+
+Provides a shared embedding capability for cognitive memory recall (#509),
+semantic codebase search (#489), and future similarity-based operations.
+
+Usage:
+    embedder = get_embedder(config)
+    vector = embedder.embed_text("some text")
+    vectors = embedder.embed_texts(["text1", "text2"])
+
+Config (config.yaml):
+    embeddings:
+      provider: "local"          # "local" or "openai"
+      model: "all-MiniLM-L6-v2"  # for local
+      # model: "text-embedding-3-small"  # for openai
+"""
+from __future__ import annotations
+
+import logging
+import math
+from typing import Protocol, runtime_checkable
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Protocol (interface)
+# ---------------------------------------------------------------------------
+
+@runtime_checkable
+class Embedder(Protocol):
+    def embed_text(self, text: str) -> list[float]: ...
+    def embed_texts(self, texts: list[str]) -> list[list[float]]: ...
+
+    @property
+    def dimensions(self) -> int: ...
+
+
+# ---------------------------------------------------------------------------
+# Local embedder (fastembed)
+# ---------------------------------------------------------------------------
+
+class FastEmbedEmbedder:
+    """Local embeddings via fastembed (all-MiniLM-L6-v2, 384 dims).
+
+    ~100MB model downloaded on first use to ~/.cache/fastembed/.
+    No API key needed, private, fast (~5ms per embed).
+    Requires: pip install fastembed
+    """
+
+    DEFAULT_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+
+    def __init__(self, model: str = DEFAULT_MODEL):
+        self.model_name = model
+        self._model = None  # Lazy initialization
+
+    def _load(self):
+        if self._model is not None:
+            return
+        try:
+            from fastembed import TextEmbedding
+        except ImportError:
+            raise ImportError(
+                "fastembed is not installed. "
+                "Install it with: pip install fastembed\n"
+                "Or: pip install 'hermes-agent[embeddings]'"
+            )
+        logger.info("Loading fastembed model '%s' (first use may download ~100MB)...", self.model_name)
+        self._model = TextEmbedding(model_name=self.model_name)
+        logger.info("fastembed model loaded.")
+
+    def embed_text(self, text: str) -> list[float]:
+        self._load()
+        results = list(self._model.embed([text]))
+        return results[0].tolist()
+
+    def embed_texts(self, texts: list[str]) -> list[list[float]]:
+        self._load()
+        results = list(self._model.embed(texts))
+        return [r.tolist() for r in results]
+
+    @property
+    def dimensions(self) -> int:
+        return 384  # all-MiniLM-L6-v2 fixed dims
+
+
+# ---------------------------------------------------------------------------
+# OpenAI embedder
+# ---------------------------------------------------------------------------
+
+class OpenAIEmbedder:
+    """API embeddings via OpenAI (text-embedding-3-small, 1536 dims).
+
+    Uses existing OpenAI client from config.
+    Higher quality but costs ~$0.02/1M tokens.
+    Requires: openai (already a dependency)
+    """
+
+    DEFAULT_MODEL = "text-embedding-3-small"
+    _DIMENSIONS = {
+        "text-embedding-3-small": 1536,
+        "text-embedding-3-large": 3072,
+        "text-embedding-ada-002": 1536,
+    }
+
+    def __init__(self, model: str = DEFAULT_MODEL, api_key: str = None, base_url: str = None):
+        self.model_name = model
+        self._api_key = api_key
+        self._base_url = base_url
+        self._client = None  # Lazy initialization
+
+    def _load(self):
+        if self._client is not None:
+            return
+        try:
+            from openai import OpenAI
+        except ImportError:
+            raise ImportError("openai package is not installed.")
+        kwargs = {}
+        if self._api_key:
+            kwargs["api_key"] = self._api_key
+        if self._base_url:
+            kwargs["base_url"] = self._base_url
+        self._client = OpenAI(**kwargs)
+
+    def embed_text(self, text: str) -> list[float]:
+        self._load()
+        response = self._client.embeddings.create(input=[text], model=self.model_name)
+        return response.data[0].embedding
+
+    def embed_texts(self, texts: list[str]) -> list[list[float]]:
+        self._load()
+        response = self._client.embeddings.create(input=texts, model=self.model_name)
+        return [item.embedding for item in response.data]
+
+    @property
+    def dimensions(self) -> int:
+        return self._DIMENSIONS.get(self.model_name, 1536)
+
+
+# ---------------------------------------------------------------------------
+# Factory
+# ---------------------------------------------------------------------------
+
+def get_embedder(config: dict) -> Embedder:
+    """Factory: returns configured embedder based on config dict.
+
+    Args:
+        config: Full config dict. Reads from config["embeddings"] section.
+
+    Returns:
+        An Embedder instance.
+
+    Raises:
+        ValueError: If provider is unknown.
+        ImportError: If required package is not installed.
+    """
+    emb_config = config.get("embeddings", {})
+    provider = emb_config.get("provider", "local")
+    model = emb_config.get("model")
+
+    if provider == "local":
+        effective_model = model or FastEmbedEmbedder.DEFAULT_MODEL
+        return FastEmbedEmbedder(model=effective_model)
+
+    elif provider == "openai":
+        effective_model = model or OpenAIEmbedder.DEFAULT_MODEL
+        api_key = emb_config.get("api_key")
+        base_url = emb_config.get("base_url")
+        return OpenAIEmbedder(model=effective_model, api_key=api_key, base_url=base_url)
+
+    else:
+        raise ValueError(
+            f"Unknown embedding provider '{provider}'. "
+            "Supported providers: 'local', 'openai'"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Utility functions
+# ---------------------------------------------------------------------------
+
+def cosine_similarity(a: list[float], b: list[float]) -> float:
+    """Compute cosine similarity between two vectors.
+
+    Returns a value in [-1, 1]. Higher = more similar.
+    Returns 0.0 if either vector has zero magnitude.
+    """
+    if len(a) != len(b):
+        raise ValueError(f"Vector dimensions must match: {len(a)} != {len(b)}")
+
+    dot = sum(x * y for x, y in zip(a, b))
+    mag_a = math.sqrt(sum(x * x for x in a))
+    mag_b = math.sqrt(sum(x * x for x in b))
+
+    if mag_a == 0.0 or mag_b == 0.0:
+        return 0.0
+
+    return dot / (mag_a * mag_b)
+
+
+def cosine_similarity_matrix(vectors: list[list[float]]) -> list[list[float]]:
+    """Compute NxN pairwise cosine similarity matrix.
+
+    Useful for deduplication: if matrix[i][j] >= 0.98, items i and j are near-duplicates.
+
+    Returns:
+        NxN matrix where matrix[i][j] = cosine_similarity(vectors[i], vectors[j])
+    """
+    n = len(vectors)
+    matrix = [[0.0] * n for _ in range(n)]
+    for i in range(n):
+        matrix[i][i] = 1.0
+        for j in range(i + 1, n):
+            sim = cosine_similarity(vectors[i], vectors[j])
+            matrix[i][j] = sim
+            matrix[j][i] = sim
+    return matrix
diff --git a/pyproject.toml b/pyproject.toml
index 807875156..57c1c4600 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,6 +51,7 @@ pty = [
     "pywinpty>=2.0.0; sys_platform == 'win32'",
 ]
 honcho = ["honcho-ai>=2.0.1"]
+embeddings = ["fastembed>=0.3.0"]
 mcp = ["mcp>=1.2.0"]
 homeassistant = ["aiohttp>=3.9.0"]
 yc-bench = ["yc-bench @ git+https://github.com/collinear-ai/yc-bench.git"]
@@ -65,6 +66,7 @@ all = [
     "hermes-agent[slack]",
     "hermes-agent[pty]",
     "hermes-agent[honcho]",
+    "hermes-agent[embeddings]",
     "hermes-agent[mcp]",
     "hermes-agent[homeassistant]",
 ]
diff --git a/tests/test_embeddings.py b/tests/test_embeddings.py
new file mode 100644
index 000000000..8c66a496c
--- /dev/null
+++ b/tests/test_embeddings.py
@@ -0,0 +1,212 @@
+"""Tests for agent/embeddings.py — Embedder protocol, implementations, factory, utilities."""
+
+import math
+import pytest
+from unittest.mock import MagicMock, patch
+
+from agent.embeddings import (
+    Embedder,
+    FastEmbedEmbedder,
+    OpenAIEmbedder,
+    get_embedder,
+    cosine_similarity,
+    cosine_similarity_matrix,
+)
+
+
+# =========================================================================
+# cosine_similarity
+# =========================================================================
+
+class TestCosineSimilarity:
+    def test_identical_vectors(self):
+        a = [1.0, 0.0, 0.0]
+        assert cosine_similarity(a, a) == pytest.approx(1.0)
+
+    def test_orthogonal_vectors(self):
+        a = [1.0, 0.0]
+        b = [0.0, 1.0]
+        assert cosine_similarity(a, b) == pytest.approx(0.0)
+
+    def test_opposite_vectors(self):
+        a = [1.0, 0.0]
+        b = [-1.0, 0.0]
+        assert cosine_similarity(a, b) == pytest.approx(-1.0)
+
+    def test_zero_vector_returns_zero(self):
+        a = [0.0, 0.0]
+        b = [1.0, 0.0]
+        assert cosine_similarity(a, b) == 0.0
+
+    def test_dimension_mismatch_raises(self):
+        with pytest.raises(ValueError, match="dimensions must match"):
+            cosine_similarity([1.0, 2.0], [1.0, 2.0, 3.0])
+
+    def test_similar_vectors(self):
+        a = [1.0, 1.0]
+        b = [1.0, 1.1]
+        sim = cosine_similarity(a, b)
+        assert 0.99 < sim < 1.0
+
+
+# =========================================================================
+# cosine_similarity_matrix
+# =========================================================================
+
+class TestCosineSimilarityMatrix:
+    def test_diagonal_is_one(self):
+        vecs = [[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]]
+        matrix = cosine_similarity_matrix(vecs)
+        for i in range(len(vecs)):
+            assert matrix[i][i] == pytest.approx(1.0)
+
+    def test_symmetry(self):
+        vecs = [[1.0, 0.0], [0.5, 0.5]]
+        matrix = cosine_similarity_matrix(vecs)
+        assert matrix[0][1] == pytest.approx(matrix[1][0])
+
+    def test_orthogonal_off_diagonal(self):
+        vecs = [[1.0, 0.0], [0.0, 1.0]]
+        matrix = cosine_similarity_matrix(vecs)
+        assert matrix[0][1] == pytest.approx(0.0)
+
+    def test_shape(self):
+        vecs = [[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]]
+        matrix = cosine_similarity_matrix(vecs)
+        assert len(matrix) == 3
+        assert all(len(row) == 3 for row in matrix)
+
+
+# =========================================================================
+# FastEmbedEmbedder
+# =========================================================================
+
+class TestFastEmbedEmbedder:
+    def test_default_model(self):
+        emb = FastEmbedEmbedder()
+        assert emb.model_name == FastEmbedEmbedder.DEFAULT_MODEL
+
+    def test_custom_model(self):
+        emb = FastEmbedEmbedder(model="custom-model")
+        assert emb.model_name == "custom-model"
+
+    def test_dimensions(self):
+        emb = FastEmbedEmbedder()
+        assert emb.dimensions == 384
+
+    def test_lazy_load(self):
+        emb = FastEmbedEmbedder()
+        assert emb._model is None
+
+    def test_import_error_if_not_installed(self):
+        emb = FastEmbedEmbedder()
+        with patch.dict("sys.modules", {"fastembed": None}):
+            with pytest.raises(ImportError, match="fastembed is not installed"):
+                emb._load()
+
+    def test_embed_text(self):
+        emb = FastEmbedEmbedder()
+        mock_model = MagicMock()
+        # Use a simple object with .tolist() instead of numpy array
+        fake_vec = MagicMock()
+        fake_vec.tolist.return_value = [0.1, 0.2, 0.3]
+        mock_model.embed.return_value = iter([fake_vec])
+        emb._model = mock_model
+        result = emb.embed_text("hello")
+        assert result == pytest.approx([0.1, 0.2, 0.3])
+
+    def test_embed_texts(self):
+        emb = FastEmbedEmbedder()
+        mock_model = MagicMock()
+        fake_vec1 = MagicMock()
+        fake_vec1.tolist.return_value = [0.1, 0.2]
+        fake_vec2 = MagicMock()
+        fake_vec2.tolist.return_value = [0.3, 0.4]
+        mock_model.embed.return_value = iter([fake_vec1, fake_vec2])
+        emb._model = mock_model
+        result = emb.embed_texts(["hello", "world"])
+        assert len(result) == 2
+        assert result[0] == pytest.approx([0.1, 0.2])
+        assert result[1] == pytest.approx([0.3, 0.4])
+
+
+# =========================================================================
+# OpenAIEmbedder
+# =========================================================================
+
+class TestOpenAIEmbedder:
+    def test_default_model(self):
+        emb = OpenAIEmbedder()
+        assert emb.model_name == OpenAIEmbedder.DEFAULT_MODEL
+
+    def test_dimensions_known_model(self):
+        assert OpenAIEmbedder(model="text-embedding-3-small").dimensions == 1536
+        assert OpenAIEmbedder(model="text-embedding-3-large").dimensions == 3072
+
+    def test_dimensions_unknown_model(self):
+        assert OpenAIEmbedder(model="unknown-model").dimensions == 1536
+
+    def test_lazy_load(self):
+        emb = OpenAIEmbedder()
+        assert emb._client is None
+
+    def test_embed_text(self):
+        emb = OpenAIEmbedder()
+        mock_client = MagicMock()
+        mock_client.embeddings.create.return_value.data = [
+            MagicMock(embedding=[0.1, 0.2, 0.3])
+        ]
+        emb._client = mock_client
+        result = emb.embed_text("hello")
+        assert result == [0.1, 0.2, 0.3]
+        mock_client.embeddings.create.assert_called_once_with(
+            input=["hello"], model=OpenAIEmbedder.DEFAULT_MODEL
+        )
+
+    def test_embed_texts(self):
+        emb = OpenAIEmbedder()
+        mock_client = MagicMock()
+        mock_client.embeddings.create.return_value.data = [
+            MagicMock(embedding=[0.1, 0.2]),
+            MagicMock(embedding=[0.3, 0.4]),
+        ]
+        emb._client = mock_client
+        result = emb.embed_texts(["hello", "world"])
+        assert len(result) == 2
+        assert result[0] == [0.1, 0.2]
+
+
+# =========================================================================
+# get_embedder factory
+# =========================================================================
+
+class TestGetEmbedder:
+    def test_default_returns_fastembed(self):
+        emb = get_embedder({})
+        assert isinstance(emb, FastEmbedEmbedder)
+
+    def test_local_provider(self):
+        emb = get_embedder({"embeddings": {"provider": "local"}})
+        assert isinstance(emb, FastEmbedEmbedder)
+
+    def test_local_custom_model(self):
+        emb = get_embedder({"embeddings": {"provider": "local", "model": "custom-model"}})
+        assert isinstance(emb, FastEmbedEmbedder)
+        assert emb.model_name == "custom-model"
+
+    def test_openai_provider(self):
+        emb = get_embedder({"embeddings": {"provider": "openai"}})
+        assert isinstance(emb, OpenAIEmbedder)
+
+    def test_openai_custom_model(self):
+        emb = get_embedder({"embeddings": {"provider": "openai", "model": "text-embedding-3-large"}})
+        assert isinstance(emb, OpenAIEmbedder)
+        assert emb.model_name == "text-embedding-3-large"
+
+    def test_unknown_provider_raises(self):
+        with pytest.raises(ValueError, match="Unknown embedding provider"):
+            get_embedder({"embeddings": {"provider": "unknown"}})
+
+    def test_embedder_protocol_compliance(self):
+        emb = get_embedder({})
+        assert isinstance(emb, Embedder)