# File listing metadata: 248 lines, 8.1 KiB, Python
"""Tests for timmy.semantic_memory — semantic search, chunking, indexing."""
|
|
|
|
import pytest
|
|
from pathlib import Path
|
|
from unittest.mock import patch
|
|
|
|
from timmy.semantic_memory import (
|
|
_simple_hash_embedding,
|
|
embed_text,
|
|
cosine_similarity,
|
|
SemanticMemory,
|
|
MemorySearcher,
|
|
MemoryChunk,
|
|
memory_search,
|
|
_get_embedding_model,
|
|
)
|
|
|
|
|
|
class TestSimpleHashEmbedding:
    """Checks for `_simple_hash_embedding`, the hash-based fallback embedding."""

    def test_returns_list_of_floats(self):
        """The result is a list holding 128 float components."""
        embedding = _simple_hash_embedding("hello world")

        assert isinstance(embedding, list)
        assert len(embedding) == 128
        assert all(isinstance(component, float) for component in embedding)

    def test_deterministic(self):
        """Embedding the same text twice yields equal vectors."""
        first = _simple_hash_embedding("same text")
        second = _simple_hash_embedding("same text")

        assert first == second

    def test_different_texts_differ(self):
        """Two unrelated texts are expected to produce unequal vectors."""
        greeting = _simple_hash_embedding("hello world")
        farewell = _simple_hash_embedding("goodbye universe")

        assert greeting != farewell

    def test_normalized(self):
        """The Euclidean length of the embedding is within 0.01 of 1.0."""
        import math

        embedding = _simple_hash_embedding("test normalization")
        sum_of_squares = sum(component * component for component in embedding)
        length = math.sqrt(sum_of_squares)

        assert abs(length - 1.0) < 0.01
|
|
|
|
|
|
class TestEmbedText:
    """Checks for `embed_text` when it runs on the fallback embedding."""

    def test_returns_embedding(self):
        """`embed_text` hands back a non-empty list."""
        # Per the project's conftest (not visible here), TIMMY_SKIP_EMBEDDINGS=1
        # is set, so this call is expected to take the fallback path.
        embedding = embed_text("test text")

        assert isinstance(embedding, list)
        assert len(embedding) > 0
|
|
|
|
|
|
class TestCosineSimilarity:
    """Checks for `cosine_similarity` on small hand-built vectors."""

    def test_identical_vectors(self):
        """A vector compared with itself scores approximately 1."""
        unit_x = [1.0, 0.0, 0.0]
        score = cosine_similarity(unit_x, unit_x)

        assert score == pytest.approx(1.0)

    def test_orthogonal_vectors(self):
        """Perpendicular vectors score approximately 0."""
        along_x = [1.0, 0.0]
        along_y = [0.0, 1.0]
        score = cosine_similarity(along_x, along_y)

        assert score == pytest.approx(0.0)

    def test_opposite_vectors(self):
        """Vectors pointing in opposite directions score approximately -1."""
        forward = [1.0, 0.0]
        backward = [-1.0, 0.0]
        score = cosine_similarity(forward, backward)

        assert score == pytest.approx(-1.0)

    def test_zero_vector(self):
        """A zero vector as the first argument must give exactly 0.0."""
        zero = [0.0, 0.0]
        along_x = [1.0, 0.0]
        score = cosine_similarity(zero, along_x)

        # Exact comparison on purpose: this test pins the literal value 0.0.
        assert score == 0.0
|
|
|
|
|
|
class TestSemanticMemory:
    """Exercise SemanticMemory against a temporary database and vault."""

    @pytest.fixture
    def mem(self, tmp_path):
        """Return a SemanticMemory whose database and vault live in ``tmp_path``.

        The instance is built with its defaults, then ``db_path`` and
        ``vault_path`` are repointed at the temporary directory and
        ``_init_db()`` is called so the database is set up at that location.
        """
        sm = SemanticMemory()
        sm.db_path = tmp_path / "test_semantic.db"
        sm.vault_path = tmp_path / "vault"
        sm.vault_path.mkdir()
        sm._init_db()
        return sm

    def test_init_creates_db(self, mem):
        """The database file exists after the fixture has run ``_init_db``."""
        assert mem.db_path.exists()

    def test_split_into_chunks_short(self, mem):
        """A single short paragraph comes back as one unchanged chunk."""
        text = "Short paragraph."
        chunks = mem._split_into_chunks(text)
        assert len(chunks) == 1
        assert chunks[0] == "Short paragraph."

    def test_split_into_chunks_multiple_paragraphs(self, mem):
        """Three blank-line-separated paragraphs give three chunks."""
        text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."
        chunks = mem._split_into_chunks(text)
        assert len(chunks) == 3

    def test_split_into_chunks_long_paragraph(self, mem):
        """One paragraph longer than ``max_chunk_size`` is split further."""
        text = ". ".join([f"Sentence {i}" for i in range(50)])
        chunks = mem._split_into_chunks(text, max_chunk_size=100)
        assert len(chunks) > 1

    def test_split_empty_text(self, mem):
        """Empty input yields an empty list of chunks."""
        assert mem._split_into_chunks("") == []

    def test_index_file(self, mem):
        """Indexing a small markdown file reports at least one chunk."""
        md_file = mem.vault_path / "test.md"
        md_file.write_text("# Title\n\nThis is a test document with enough content to index properly.\n\nAnother paragraph with more content here.")
        count = mem.index_file(md_file)
        assert count > 0

    def test_index_nonexistent_file(self, mem):
        """A path that does not exist is reported as zero chunks."""
        count = mem.index_file(Path("/nonexistent/file.md"))
        assert count == 0

    def test_index_file_skips_already_indexed(self, mem):
        """Re-indexing an unchanged file is expected to report zero chunks."""
        md_file = mem.vault_path / "cached.md"
        md_file.write_text("# Cached\n\nContent that should only be indexed once if unchanged.")
        count1 = mem.index_file(md_file)
        count2 = mem.index_file(md_file)
        assert count1 > 0
        assert count2 == 0  # Already indexed, same hash

    def test_index_vault(self, mem):
        """Indexing a vault holding two files reports a total of at least 2."""
        (mem.vault_path / "a.md").write_text("# File A\n\nContent of file A with some meaningful text here.")
        (mem.vault_path / "b.md").write_text("# File B\n\nContent of file B with different meaningful text.")
        total = mem.index_vault()
        assert total >= 2

    def test_index_vault_skips_handoff(self, mem):
        """Verify handoff files are excluded from indexing."""
        import sqlite3
        from contextlib import closing

        handoff = mem.vault_path / "last-session-handoff.md"
        handoff.write_text("# Handoff\n\nThis should be skipped completely from indexing.")
        real = mem.vault_path / "real.md"
        real.write_text("# Real\n\nThis should be indexed with enough meaningful content.")

        # index_file on the handoff file should NOT skip it
        # (that's only index_vault logic), so test the vault logic directly
        count = mem.index_file(handoff)
        assert count > 0  # index_file indexes everything

        # Wipe and re-test via index_vault.
        # closing() guarantees the connection is released even when a
        # statement raises; sqlite3's own context manager only commits or
        # rolls back and leaves the connection open.
        with closing(sqlite3.connect(str(mem.db_path))) as conn:
            conn.execute("DELETE FROM chunks")
            conn.commit()

        mem.index_vault()

        with closing(sqlite3.connect(str(mem.db_path))) as conn:
            rows = conn.execute("SELECT DISTINCT source FROM chunks").fetchall()

        sources = [r[0] for r in rows]
        # Only the real file should be indexed, not the handoff
        assert any("real" in s for s in sources)
        assert not any("last-session-handoff" in s for s in sources)

    def test_search_returns_results(self, mem):
        """Searching an indexed file returns a non-empty list of 2-tuples."""
        md = mem.vault_path / "searchable.md"
        md.write_text("# Python\n\nPython is a programming language used for web development and data science.")
        mem.index_file(md)

        results = mem.search("programming language")
        assert len(results) > 0
        # Each result is (content, score)
        assert isinstance(results[0], tuple)
        assert len(results[0]) == 2

    def test_search_empty_db(self, mem):
        """Searching before anything is indexed returns an empty list."""
        results = mem.search("anything")
        assert results == []

    def test_get_relevant_context(self, mem):
        """``get_relevant_context`` returns a string for an indexed vault."""
        md = mem.vault_path / "context.md"
        md.write_text("# Important\n\nThis is very important information about the system architecture.")
        mem.index_file(md)

        ctx = mem.get_relevant_context("architecture")
        # May or may not match depending on hash-based similarity
        assert isinstance(ctx, str)

    def test_get_relevant_context_empty(self, mem):
        """With nothing indexed the relevant context is the empty string."""
        assert mem.get_relevant_context("anything") == ""

    def test_stats(self, mem):
        """``stats()`` exposes both counters and starts at zero chunks."""
        stats = mem.stats()
        assert "total_chunks" in stats
        assert "total_files" in stats
        assert stats["total_chunks"] == 0
|
|
|
|
|
|
class TestMemorySearcher:
    """Checks for the high-level `MemorySearcher` interface."""

    @pytest.fixture
    def searcher(self, tmp_path):
        """Build a MemorySearcher whose semantic store lives under ``tmp_path``."""
        instance = MemorySearcher()
        store = instance.semantic

        store.db_path = tmp_path / "searcher.db"
        store.vault_path = tmp_path / "vault"
        store.vault_path.mkdir()
        store._init_db()

        return instance

    def test_search_semantic_tier(self, searcher):
        """Requesting the semantic tier explicitly puts that key in the result."""
        found = searcher.search("test query", tiers=["semantic"])

        assert "semantic" in found

    def test_search_defaults_to_semantic(self, searcher):
        """Omitting ``tiers`` still produces a "semantic" entry."""
        found = searcher.search("test")

        assert "semantic" in found

    def test_get_context_for_query_empty(self, searcher):
        """With an empty database the context for a query is the empty string."""
        context = searcher.get_context_for_query("test")

        assert context == ""
|
|
|
|
|
|
class TestMemorySearch:
    """Checks for the module-level `memory_search` function."""

    def test_no_results(self):
        """A query unlikely to match anything still returns a string."""
        outcome = memory_search("something obscure that won't match anything")

        assert isinstance(outcome, str)

    def test_none_top_k_handled(self):
        """Passing ``top_k=None`` is accepted and still returns a string."""
        outcome = memory_search("test", top_k=None)

        assert isinstance(outcome, str)
|
|
|
|
|
|
class TestMemoryChunk:
    """Checks for constructing a `MemoryChunk`."""

    def test_create(self):
        """Keyword construction stores ``id`` and ``content`` as given."""
        fields = {
            "id": "c1",
            "source": "/path/to/file.md",
            "content": "chunk text",
            "embedding": [0.1, 0.2],
            "created_at": "2026-03-06",
        }

        chunk = MemoryChunk(**fields)

        assert chunk.id == "c1"
        assert chunk.content == "chunk text"
|