diff --git a/nexus/mnemosyne/archive.py b/nexus/mnemosyne/archive.py
index 6a13b207..e6e27944 100644
--- a/nexus/mnemosyne/archive.py
+++ b/nexus/mnemosyne/archive.py
@@ -7,10 +7,11 @@ and provides query interfaces for retrieving connected knowledge.
 from __future__ import annotations
 
 import json
+from datetime import datetime, timezone
 from pathlib import Path
 from typing import Optional
 
-from nexus.mnemosyne.entry import ArchiveEntry
+from nexus.mnemosyne.entry import ArchiveEntry, _compute_content_hash
 from nexus.mnemosyne.linker import HolographicLinker
 
 _EXPORT_VERSION = "1"
@@ -49,14 +50,95 @@ class MnemosyneArchive:
         with open(self.path, "w") as f:
             json.dump(data, f, indent=2)
 
+    def find_duplicate(self, entry: ArchiveEntry) -> Optional[ArchiveEntry]:
+        """Return another entry (different id) with the same content hash, or None."""
+        for existing in self._entries.values():
+            if existing.content_hash == entry.content_hash and existing.id != entry.id:
+                return existing
+        return None
+
     def add(self, entry: ArchiveEntry, auto_link: bool = True) -> ArchiveEntry:
-        """Add an entry to the archive. Auto-links to related entries."""
+        """Add an entry to the archive. Auto-links to related entries.
+
+        If an entry with the same content hash already exists, returns the
+        existing entry without creating a duplicate.
+        """
+        duplicate = self.find_duplicate(entry)
+        if duplicate is not None:
+            return duplicate
         self._entries[entry.id] = entry
         if auto_link:
             self.linker.apply_links(entry, list(self._entries.values()))
         self._save()
         return entry
 
+    def update_entry(
+        self,
+        entry_id: str,
+        title: Optional[str] = None,
+        content: Optional[str] = None,
+        metadata: Optional[dict] = None,
+        auto_link: bool = True,
+    ) -> ArchiveEntry:
+        """Update title, content, and/or metadata on an existing entry.
+
+        Refreshes ``content_hash`` and re-runs auto-linking when the title or
+        content changes. ``updated_at`` is bumped and the archive is saved only
+        when the call actually changes something, so a no-op call leaves the
+        entry and the archive file untouched.
+
+        Args:
+            entry_id: ID of the entry to update.
+            title: New title, or None to leave unchanged.
+            content: New content, or None to leave unchanged.
+            metadata: Dict to merge into existing metadata (replaces keys present).
+            auto_link: If True, re-run holographic linker after content change.
+
+        Returns:
+            The updated ArchiveEntry.
+
+        Raises:
+            KeyError: If entry_id does not exist.
+        """
+        entry = self._entries.get(entry_id)
+        if entry is None:
+            raise KeyError(entry_id)
+
+        content_changed = False
+        if title is not None and title != entry.title:
+            entry.title = title
+            content_changed = True
+        if content is not None and content != entry.content:
+            entry.content = content
+            content_changed = True
+
+        metadata_changed = False
+        if metadata:
+            merged = {**entry.metadata, **metadata}
+            if merged != entry.metadata:
+                entry.metadata.update(metadata)
+                metadata_changed = True
+
+        if not (content_changed or metadata_changed):
+            # Nothing was mutated: keep updated_at and skip the disk write.
+            return entry
+
+        if content_changed:
+            entry.content_hash = _compute_content_hash(entry.title, entry.content)
+
+        entry.updated_at = datetime.now(timezone.utc).isoformat()
+
+        if content_changed and auto_link:
+            # Clear old links from this entry and re-run linker
+            for other in self._entries.values():
+                if entry_id in other.links:
+                    other.links.remove(entry_id)
+            entry.links = []
+            self.linker.apply_links(entry, list(self._entries.values()))
+
+        self._save()
+        return entry
+
     def get(self, entry_id: str) -> Optional[ArchiveEntry]:
         return self._entries.get(entry_id)
 
diff --git a/nexus/mnemosyne/entry.py b/nexus/mnemosyne/entry.py
index a9e4be40..27def478 100644
--- a/nexus/mnemosyne/entry.py
+++ b/nexus/mnemosyne/entry.py
@@ -6,12 +6,19 @@ with metadata, content, and links to related entries.
 
 from __future__ import annotations
 
+import hashlib
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
 from typing import Optional
 import uuid
 
 
+def _compute_content_hash(title: str, content: str) -> str:
+    """Compute SHA-256 of title+content for deduplication."""
+    raw = f"{title}\x00{content}".encode("utf-8")
+    return hashlib.sha256(raw).hexdigest()
+
+
 @dataclass
 class ArchiveEntry:
     """A single node in the Mnemosyne holographic archive."""
@@ -24,7 +31,13 @@ class ArchiveEntry:
     topics: list[str] = field(default_factory=list)
     metadata: dict = field(default_factory=dict)
     created_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
+    updated_at: Optional[str] = None  # Set on mutation; None means same as created_at
     links: list[str] = field(default_factory=list)  # IDs of related entries
+    content_hash: Optional[str] = None  # SHA-256 of title+content for dedup
+
+    def __post_init__(self):
+        if self.content_hash is None:
+            self.content_hash = _compute_content_hash(self.title, self.content)
 
     def to_dict(self) -> dict:
         return {
@@ -36,7 +49,9 @@ class ArchiveEntry:
             "topics": self.topics,
             "metadata": self.metadata,
             "created_at": self.created_at,
+            "updated_at": self.updated_at,
             "links": self.links,
+            "content_hash": self.content_hash,
         }
 
     @classmethod
diff --git a/nexus/mnemosyne/tests/test_archive.py b/nexus/mnemosyne/tests/test_archive.py
index 4a4ab610..b6477d84 100644
--- a/nexus/mnemosyne/tests/test_archive.py
+++ b/nexus/mnemosyne/tests/test_archive.py
@@ -491,3 +491,179 @@ def test_tag_persistence_across_reload():
         fresh = a2.get(e.id)
         assert "beta" in fresh.topics
         assert "alpha" not in fresh.topics
+
+
+# --- content_hash and updated_at field tests ---
+
+def test_entry_has_content_hash():
+    e = ArchiveEntry(title="Hello", content="world")
+    assert e.content_hash is not None
+    assert len(e.content_hash) == 64  # SHA-256 hex
+
+
+def test_entry_content_hash_deterministic():
+    e1 = ArchiveEntry(title="Hello", content="world")
+    e2 = ArchiveEntry(title="Hello", content="world")
+    assert e1.content_hash == e2.content_hash
+
+
+def test_entry_content_hash_differs_on_different_content():
+    e1 = ArchiveEntry(title="Hello", content="world")
+    e2 = ArchiveEntry(title="Hello", content="different")
+    assert e1.content_hash != e2.content_hash
+
+
+def test_entry_updated_at_defaults_none():
+    e = ArchiveEntry(title="T", content="c")
+    assert e.updated_at is None
+
+
+def test_entry_roundtrip_includes_new_fields():
+    e = ArchiveEntry(title="T", content="c")
+    d = e.to_dict()
+    assert "content_hash" in d
+    assert "updated_at" in d
+    e2 = ArchiveEntry.from_dict(d)
+    assert e2.content_hash == e.content_hash
+    assert e2.updated_at == e.updated_at
+
+
+# --- content deduplication tests ---
+
+def test_add_deduplication_same_content():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        e1 = ingest_event(archive, title="Dup", content="Same content here")
+        e2 = ingest_event(archive, title="Dup", content="Same content here")
+        # Should NOT have created a second entry
+        assert archive.count == 1
+        assert e1.id == e2.id
+
+
+def test_add_deduplication_different_content():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        ingest_event(archive, title="A", content="Content one")
+        ingest_event(archive, title="B", content="Content two")
+        assert archive.count == 2
+
+
+def test_find_duplicate_returns_existing():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        e1 = ingest_event(archive, title="Dup", content="Same content here")
+        probe = ArchiveEntry(title="Dup", content="Same content here")
+        dup = archive.find_duplicate(probe)
+        assert dup is not None
+        assert dup.id == e1.id
+
+
+def test_find_duplicate_returns_none_for_unique():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        ingest_event(archive, title="A", content="Some content")
+        probe = ArchiveEntry(title="B", content="Totally different content")
+        assert archive.find_duplicate(probe) is None
+
+
+def test_find_duplicate_empty_archive():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        probe = ArchiveEntry(title="X", content="y")
+        assert archive.find_duplicate(probe) is None
+
+
+# --- update_entry tests ---
+
+def test_update_entry_title():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        e = ingest_event(archive, title="Old title", content="Some content")
+        archive.update_entry(e.id, title="New title")
+        fresh = archive.get(e.id)
+        assert fresh.title == "New title"
+        assert fresh.content == "Some content"
+
+
+def test_update_entry_content():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        e = ingest_event(archive, title="T", content="Old content")
+        archive.update_entry(e.id, content="New content")
+        fresh = archive.get(e.id)
+        assert fresh.content == "New content"
+
+
+def test_update_entry_metadata():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        e = ingest_event(archive, title="T", content="c")
+        archive.update_entry(e.id, metadata={"key": "value"})
+        fresh = archive.get(e.id)
+        assert fresh.metadata["key"] == "value"
+
+
+def test_update_entry_bumps_updated_at():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        e = ingest_event(archive, title="T", content="c")
+        assert e.updated_at is None
+        archive.update_entry(e.id, title="Updated")
+        fresh = archive.get(e.id)
+        assert fresh.updated_at is not None
+
+
+def test_update_entry_refreshes_content_hash():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        e = ingest_event(archive, title="T", content="Original content")
+        old_hash = e.content_hash
+        archive.update_entry(e.id, content="Completely new content")
+        fresh = archive.get(e.id)
+        assert fresh.content_hash != old_hash
+
+
+def test_update_entry_missing_raises():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        try:
+            archive.update_entry("nonexistent-id", title="X")
+            assert False, "Expected KeyError"
+        except KeyError:
+            pass
+
+
+def test_update_entry_persists_across_reload():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        a1 = MnemosyneArchive(archive_path=path)
+        e = ingest_event(a1, title="Before", content="Before content")
+        a1.update_entry(e.id, title="After", content="After content")
+
+        a2 = MnemosyneArchive(archive_path=path)
+        fresh = a2.get(e.id)
+        assert fresh.title == "After"
+        assert fresh.content == "After content"
+        assert fresh.updated_at is not None
+
+
+def test_update_entry_no_change_no_crash():
+    """Calling update_entry with all None args should not fail."""
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        e = ingest_event(archive, title="T", content="c")
+        result = archive.update_entry(e.id)
+        assert result.title == "T"
+        assert result.updated_at is None  # a no-op call must not bump updated_at