diff --git a/mnemosyne/__pycache__/__init__.cpython-311.pyc b/mnemosyne/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 00000000..692d2702
Binary files /dev/null and b/mnemosyne/__pycache__/__init__.cpython-311.pyc differ
diff --git a/mnemosyne/__pycache__/index.cpython-311.pyc b/mnemosyne/__pycache__/index.cpython-311.pyc
new file mode 100644
index 00000000..8650aa01
Binary files /dev/null and b/mnemosyne/__pycache__/index.cpython-311.pyc differ
diff --git a/mnemosyne/__pycache__/ingest.cpython-311.pyc b/mnemosyne/__pycache__/ingest.cpython-311.pyc
new file mode 100644
index 00000000..3730fd30
Binary files /dev/null and b/mnemosyne/__pycache__/ingest.cpython-311.pyc differ
diff --git a/tests/test_mnemosyne.py b/tests/test_mnemosyne.py
new file mode 100644
index 00000000..28eeffc3
--- /dev/null
+++ b/tests/test_mnemosyne.py
@@ -0,0 +1,205 @@
+"""
+Tests for Mnemosyne — The Living Holographic Archive.
+
+Round-trip: ingest sample docs → query → verify results.
+"""
+
+import json
+import os
+import tempfile
+import pytest
+
+# Add parent to path for imports
+import sys
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from mnemosyne.ingest import (
+    chunk_text, ingest_text, ingest_file, ingest_directory,
+    get_stats, get_db,
+)
+from mnemosyne.index import keyword_search, query, list_documents, get_document
+
+
+@pytest.fixture
+def db_path(tmp_path):
+    """Temporary database for each test."""
+    return str(tmp_path / "test_mnemosyne.db")
+
+
+@pytest.fixture
+def sample_docs(tmp_path):
+    """Create sample documents for testing."""
+    docs = {}
+
+    # Plain text
+    txt = tmp_path / "alice.txt"
+    txt.write_text(
+        "Alice was beginning to get very tired of sitting by her sister on the bank. "
+        "She had peeped into the book her sister was reading, but it had no pictures "
+        "or conversations in it. 'And what is the use of a book,' thought Alice, "
+        "'without pictures or conversations?'"
+    )
+    docs["txt"] = str(txt)
+
+    # Markdown
+    md = tmp_path / "readme.md"
+    md.write_text(
+        "# Project Mnemosyne\n\n"
+        "Mnemosyne is a sovereign holographic archive system.\n\n"
+        "## Features\n\n"
+        "- Full-text search with FTS5\n"
+        "- Semantic search with embeddings\n"
+        "- Reciprocal rank fusion for hybrid results\n"
+        "- SQLite-backed, no external dependencies\n"
+    )
+    docs["md"] = str(md)
+
+    # JSON
+    js = tmp_path / "data.json"
+    js.write_text(json.dumps({
+        "title": "The Sovereignty Principle",
+        "body": "Every person has the right to run their own intelligence on their own hardware, "
+                "answerable to no one. This is the foundation of digital sovereignty.",
+    }))
+    docs["json"] = str(js)
+
+    # JSON array
+    js_arr = tmp_path / "records.json"
+    js_arr.write_text(json.dumps([
+        {"title": "Record A", "text": "First record about Bitcoin and the blockchain."},
+        {"title": "Record B", "text": "Second record about AI and language models."},
+    ]))
+    docs["json_array"] = str(js_arr)
+
+    return docs
+
+
+class TestChunking:
+    def test_short_text_no_split(self):
+        text = "Short text."
+        chunks = chunk_text(text, chunk_size=100)
+        assert len(chunks) == 1
+        assert chunks[0] == text
+
+    def test_long_text_splits(self):
+        text = "word " * 200  # 1000 chars
+        chunks = chunk_text(text, chunk_size=200, overlap=20)
+        assert len(chunks) > 1
+
+    def test_overlap_exists(self):
+        text = "aaa " * 100 + "bbb " * 100
+        chunks = chunk_text(text, chunk_size=200, overlap=50)
+        # Some chunks should contain both aaa and bbb due to overlap
+        cross_chunks = [c for c in chunks if "aaa" in c and "bbb" in c]
+        assert len(cross_chunks) > 0
+
+
+class TestIngestion:
+    def test_ingest_text_returns_id(self, db_path):
+        doc_id = ingest_text("Hello world", source="test", db_path=db_path)
+        assert doc_id is not None
+        assert doc_id > 0
+
+    def test_ingest_text_dedup(self, db_path):
+        doc_id1 = ingest_text("Hello world", source="test", db_path=db_path)
+        doc_id2 = ingest_text("Hello world", source="test", db_path=db_path)
+        assert doc_id1 is not None
+        assert doc_id2 is None  # duplicate
+
+    def test_ingest_file_txt(self, db_path, sample_docs):
+        doc_id = ingest_file(sample_docs["txt"], db_path=db_path)
+        assert doc_id is not None
+
+    def test_ingest_file_json(self, db_path, sample_docs):
+        doc_id = ingest_file(sample_docs["json"], db_path=db_path)
+        assert doc_id is not None
+
+    def test_ingest_file_json_array(self, db_path, sample_docs):
+        doc_id = ingest_file(sample_docs["json_array"], db_path=db_path)
+        assert doc_id is not None
+        # Should have ingested 2 records
+        stats = get_stats(db_path)
+        assert stats["documents"] == 2
+
+    def test_ingest_directory(self, db_path, sample_docs, tmp_path):
+        result = ingest_directory(str(tmp_path), db_path=db_path)
+        assert result["ingested"] >= 4
+        assert len(result["errors"]) == 0
+
+    def test_stats(self, db_path, sample_docs):
+        ingest_file(sample_docs["txt"], db_path=db_path)
+        ingest_file(sample_docs["md"], db_path=db_path)
+        stats = get_stats(db_path)
+        assert stats["documents"] == 2
+        assert stats["chunks"] >= 2
+
+
+class TestSearch:
+    def test_keyword_search(self, db_path, sample_docs):
+        ingest_file(sample_docs["md"], db_path=db_path)
+        results = keyword_search("Mnemosyne archive", db_path=db_path)
+        assert len(results) > 0
+        assert "mnemosyne" in results[0]["content"].lower() or "archive" in results[0]["content"].lower()
+
+    def test_query_returns_results(self, db_path, sample_docs):
+        ingest_file(sample_docs["txt"], db_path=db_path)
+        results = query("Alice tired bank", db_path=db_path)
+        assert len(results) > 0
+
+    def test_query_empty_db(self, db_path):
+        results = query("anything", db_path=db_path)
+        assert results == []
+
+    def test_query_no_match(self, db_path, sample_docs):
+        ingest_file(sample_docs["txt"], db_path=db_path)
+        results = query("xyzzyplugh quantum entanglement", db_path=db_path)
+        assert results == []
+
+    def test_list_documents(self, db_path, sample_docs):
+        ingest_file(sample_docs["txt"], db_path=db_path)
+        ingest_file(sample_docs["md"], db_path=db_path)
+        docs = list_documents(db_path=db_path)
+        assert len(docs) == 2
+        assert all("chunks" in d for d in docs)
+
+    def test_get_document(self, db_path, sample_docs):
+        doc_id = ingest_file(sample_docs["txt"], db_path=db_path)
+        doc = get_document(doc_id, db_path=db_path)
+        assert doc is not None
+        assert "Alice" in doc["content"]
+        assert doc["title"] == "alice"
+
+    def test_get_document_not_found(self, db_path):
+        doc = get_document(9999, db_path=db_path)
+        assert doc is None
+
+
+class TestRoundTrip:
+    """Full round-trip: ingest → query → verify recall."""
+
+    def test_round_trip(self, db_path, sample_docs, tmp_path):
+        # Ingest all sample docs
+        result = ingest_directory(str(tmp_path), db_path=db_path)
+        assert result["ingested"] >= 4
+
+        # Verify stats
+        stats = get_stats(db_path)
+        assert stats["documents"] >= 4
+        assert stats["chunks"] > 0
+
+        # Query for Alice
+        results = query("Alice pictures conversations", db_path=db_path)
+        assert len(results) > 0
+        assert any("alice" in r.get("title", "").lower() or "Alice" in r["content"] for r in results)
+
+        # Query for Mnemosyne
+        results = query("Mnemosyne sovereign archive", db_path=db_path)
+        assert len(results) > 0
+
+        # Query for sovereignty
+        results = query("sovereignty intelligence hardware", db_path=db_path)
+        assert len(results) > 0
+
+        # List all documents
+        docs = list_documents(db_path=db_path)
+        assert len(docs) >= 4