# Timmy-time-dashboard/tests/timmy/test_semantic_memory.py

"""Tests for timmy.semantic_memory — semantic search, chunking, indexing."""

import pytest
from pathlib import Path
from unittest.mock import patch

from timmy.semantic_memory import (
    _simple_hash_embedding,
    embed_text,
    cosine_similarity,
    SemanticMemory,
    MemorySearcher,
    MemoryChunk,
    memory_search,
    _get_embedding_model,
)


class TestSimpleHashEmbedding:
    """Exercise the hash-based fallback embedding function."""

    def test_returns_list_of_floats(self):
        """The embedding is a list of 128 entries, each of them a float."""
        embedding = _simple_hash_embedding("hello world")
        assert isinstance(embedding, list)
        assert len(embedding) == 128
        for component in embedding:
            assert isinstance(component, float)

    def test_deterministic(self):
        """Embedding the same text twice gives equal vectors."""
        first = _simple_hash_embedding("same text")
        second = _simple_hash_embedding("same text")
        assert first == second

    def test_different_texts_differ(self):
        """Two unrelated texts do not map to the same vector."""
        greeting = _simple_hash_embedding("hello world")
        farewell = _simple_hash_embedding("goodbye universe")
        assert greeting != farewell

    def test_normalized(self):
        """The Euclidean length of the embedding is within 0.01 of 1."""
        import math

        embedding = _simple_hash_embedding("test normalization")
        squared_sum = sum(component * component for component in embedding)
        norm = math.sqrt(squared_sum)
        assert abs(norm - 1.0) < 0.01


class TestEmbedText:
    """Check embed_text as it behaves under the test configuration."""

    def test_returns_embedding(self):
        """embed_text returns a non-empty list for ordinary text."""
        # conftest sets TIMMY_SKIP_EMBEDDINGS=1, so the fallback path is used
        embedding = embed_text("test text")
        assert isinstance(embedding, list)
        assert len(embedding) > 0


class TestCosineSimilarity:
    """Check cosine_similarity on a handful of hand-picked vectors."""

    def test_identical_vectors(self):
        """A vector compared with itself scores approximately 1."""
        unit_x = [1.0, 0.0, 0.0]
        score = cosine_similarity(unit_x, unit_x)
        assert score == pytest.approx(1.0)

    def test_orthogonal_vectors(self):
        """Perpendicular vectors score approximately 0."""
        along_x = [1.0, 0.0]
        along_y = [0.0, 1.0]
        score = cosine_similarity(along_x, along_y)
        assert score == pytest.approx(0.0)

    def test_opposite_vectors(self):
        """Vectors pointing in opposite directions score approximately -1."""
        forward = [1.0, 0.0]
        backward = [-1.0, 0.0]
        score = cosine_similarity(forward, backward)
        assert score == pytest.approx(-1.0)

    def test_zero_vector(self):
        """A zero vector on either side gives exactly 0.0."""
        origin = [0.0, 0.0]
        along_x = [1.0, 0.0]
        score = cosine_similarity(origin, along_x)
        assert score == 0.0


class TestSemanticMemory:
    """Test SemanticMemory class against a per-test database and vault."""

    @pytest.fixture
    def mem(self, tmp_path):
        """Return a SemanticMemory whose DB and vault live under tmp_path.

        The paths are overridden after construction and ``_init_db`` is
        called again so that the schema exists in the temporary database.
        """
        sm = SemanticMemory()
        sm.db_path = tmp_path / "test_semantic.db"
        sm.vault_path = tmp_path / "vault"
        sm.vault_path.mkdir()
        sm._init_db()
        return sm

    def test_init_creates_db(self, mem):
        """Initialising the store creates the database file on disk."""
        assert mem.db_path.exists()

    def test_split_into_chunks_short(self, mem):
        """A single short paragraph comes back as one unchanged chunk."""
        text = "Short paragraph."
        chunks = mem._split_into_chunks(text)
        assert len(chunks) == 1
        assert chunks[0] == "Short paragraph."

    def test_split_into_chunks_multiple_paragraphs(self, mem):
        """Three blank-line-separated paragraphs produce three chunks."""
        text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."
        chunks = mem._split_into_chunks(text)
        assert len(chunks) == 3

    def test_split_into_chunks_long_paragraph(self, mem):
        """A paragraph longer than max_chunk_size is split into several chunks."""
        text = ". ".join([f"Sentence {i}" for i in range(50)])
        chunks = mem._split_into_chunks(text, max_chunk_size=100)
        assert len(chunks) > 1

    def test_split_empty_text(self, mem):
        """Empty input produces no chunks."""
        assert mem._split_into_chunks("") == []

    def test_index_file(self, mem):
        """Indexing a markdown file reports a positive chunk count."""
        md_file = mem.vault_path / "test.md"
        md_file.write_text("# Title\n\nThis is a test document with enough content to index properly.\n\nAnother paragraph with more content here.")
        count = mem.index_file(md_file)
        assert count > 0

    def test_index_nonexistent_file(self, mem):
        """Indexing a path that does not exist reports zero chunks."""
        count = mem.index_file(Path("/nonexistent/file.md"))
        assert count == 0

    def test_index_file_skips_already_indexed(self, mem):
        """Re-indexing an unchanged file reports zero on the second pass."""
        md_file = mem.vault_path / "cached.md"
        md_file.write_text("# Cached\n\nContent that should only be indexed once if unchanged.")
        count1 = mem.index_file(md_file)
        count2 = mem.index_file(md_file)
        assert count1 > 0
        assert count2 == 0  # Already indexed, same hash

    def test_index_vault(self, mem):
        """Indexing a vault with two files reports at least two chunks."""
        (mem.vault_path / "a.md").write_text("# File A\n\nContent of file A with some meaningful text here.")
        (mem.vault_path / "b.md").write_text("# File B\n\nContent of file B with different meaningful text.")
        total = mem.index_vault()
        assert total >= 2

    def test_index_vault_skips_handoff(self, mem):
        """Verify handoff files are excluded from indexing.

        ``index_file`` is expected to index the handoff file when called
        directly; only ``index_vault`` is expected to leave it out.
        """
        # Local imports keep this test self-contained; only this test
        # inspects the database directly.
        import sqlite3
        from contextlib import closing

        handoff = mem.vault_path / "last-session-handoff.md"
        handoff.write_text("# Handoff\n\nThis should be skipped completely from indexing.")
        real = mem.vault_path / "real.md"
        real.write_text("# Real\n\nThis should be indexed with enough meaningful content.")

        # index_file on the handoff file should NOT skip it
        # (that's only index_vault logic), so test the vault logic directly
        count = mem.index_file(handoff)
        assert count > 0  # index_file indexes everything

        # Wipe and re-test via index_vault.  closing() guarantees the
        # connection is released even if a statement raises, so a failing
        # query cannot leave the temporary database file open.
        with closing(sqlite3.connect(str(mem.db_path))) as conn:
            conn.execute("DELETE FROM chunks")
            conn.commit()

        mem.index_vault()
        with closing(sqlite3.connect(str(mem.db_path))) as conn:
            rows = conn.execute("SELECT DISTINCT source FROM chunks").fetchall()
        sources = [r[0] for r in rows]
        # Only the real file should be indexed, not the handoff
        assert any("real" in s for s in sources)
        assert not any("last-session-handoff" in s for s in sources)

    def test_search_returns_results(self, mem):
        """Searching an indexed store returns (content, score) pairs."""
        md = mem.vault_path / "searchable.md"
        md.write_text("# Python\n\nPython is a programming language used for web development and data science.")
        mem.index_file(md)

        results = mem.search("programming language")
        assert len(results) > 0
        # Each result is (content, score)
        assert isinstance(results[0], tuple)
        assert len(results[0]) == 2

    def test_search_empty_db(self, mem):
        """Searching a store with nothing indexed returns an empty list."""
        results = mem.search("anything")
        assert results == []

    def test_get_relevant_context(self, mem):
        """get_relevant_context returns a string for an indexed store."""
        md = mem.vault_path / "context.md"
        md.write_text("# Important\n\nThis is very important information about the system architecture.")
        mem.index_file(md)

        ctx = mem.get_relevant_context("architecture")
        # May or may not match depending on hash-based similarity
        assert isinstance(ctx, str)

    def test_get_relevant_context_empty(self, mem):
        """With nothing indexed the relevant context is the empty string."""
        assert mem.get_relevant_context("anything") == ""

    def test_stats(self, mem):
        """stats() exposes both counters and reports zero chunks when empty."""
        stats = mem.stats()
        assert "total_chunks" in stats
        assert "total_files" in stats
        assert stats["total_chunks"] == 0


class TestMemorySearcher:
    """Exercise the high-level MemorySearcher interface."""

    @pytest.fixture
    def searcher(self, tmp_path):
        """Build a MemorySearcher whose semantic store lives under tmp_path."""
        instance = MemorySearcher()
        db_file = tmp_path / "searcher.db"
        vault_dir = tmp_path / "vault"
        instance.semantic.db_path = db_file
        instance.semantic.vault_path = vault_dir
        instance.semantic.vault_path.mkdir()
        instance.semantic._init_db()
        return instance

    def test_search_semantic_tier(self, searcher):
        """Requesting the semantic tier explicitly yields a 'semantic' entry."""
        outcome = searcher.search("test query", tiers=["semantic"])
        assert "semantic" in outcome

    def test_search_defaults_to_semantic(self, searcher):
        """Omitting tiers still yields a 'semantic' entry."""
        outcome = searcher.search("test")
        assert "semantic" in outcome

    def test_get_context_for_query_empty(self, searcher):
        """With an empty database the query context is the empty string."""
        context = searcher.get_context_for_query("test")
        assert context == ""  # Empty DB


class TestMemorySearch:
    """Exercise the module-level memory_search helper."""

    def test_no_results(self):
        """A query unlikely to match anything still returns a string."""
        query = "something obscure that won't match anything"
        outcome = memory_search(query)
        assert isinstance(outcome, str)

    def test_none_top_k_handled(self):
        """Passing top_k=None is accepted and a string is returned."""
        outcome = memory_search("test", top_k=None)
        assert isinstance(outcome, str)


class TestMemoryChunk:
    """Exercise construction of the MemoryChunk dataclass."""

    def test_create(self):
        """Keyword construction stores id and content as given."""
        fields = {
            "id": "c1",
            "source": "/path/to/file.md",
            "content": "chunk text",
            "embedding": [0.1, 0.2],
            "created_at": "2026-03-06",
        }
        chunk = MemoryChunk(**fields)
        assert chunk.id == "c1"
        assert chunk.content == "chunk text"