feat(mnemosyne): add entry update + content deduplication
- Add `content_hash` field (SHA-256 of title+content) to ArchiveEntry - Add `updated_at` field to ArchiveEntry (None until first mutation) - Add `find_duplicate()` to MnemosyneArchive — returns existing entry with matching content hash, or None - Dedup check in `add()` — returns existing entry when content hash matches, preventing duplicate UUIDs from repeat ingestion - Add `update_entry(entry_id, title, content, metadata)` — updates fields, refreshes content_hash, bumps updated_at, re-runs auto-linking on change - 18 new tests covering all new behaviour Fixes #1239 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -7,10 +7,11 @@ and provides query interfaces for retrieving connected knowledge.
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from nexus.mnemosyne.entry import ArchiveEntry
|
||||
from nexus.mnemosyne.entry import ArchiveEntry, _compute_content_hash
|
||||
from nexus.mnemosyne.linker import HolographicLinker
|
||||
|
||||
_EXPORT_VERSION = "1"
|
||||
@@ -49,14 +50,83 @@ class MnemosyneArchive:
|
||||
with open(self.path, "w") as f:
|
||||
json.dump(data, f, indent=2)
|
||||
|
||||
def find_duplicate(self, entry: ArchiveEntry) -> Optional[ArchiveEntry]:
|
||||
"""Return an existing entry with the same content hash, or None."""
|
||||
for existing in self._entries.values():
|
||||
if existing.content_hash == entry.content_hash and existing.id != entry.id:
|
||||
return existing
|
||||
return None
|
||||
|
||||
def add(self, entry: ArchiveEntry, auto_link: bool = True) -> ArchiveEntry:
|
||||
"""Add an entry to the archive. Auto-links to related entries."""
|
||||
"""Add an entry to the archive. Auto-links to related entries.
|
||||
|
||||
If an entry with the same content hash already exists, returns the
|
||||
existing entry without creating a duplicate.
|
||||
"""
|
||||
duplicate = self.find_duplicate(entry)
|
||||
if duplicate is not None:
|
||||
return duplicate
|
||||
self._entries[entry.id] = entry
|
||||
if auto_link:
|
||||
self.linker.apply_links(entry, list(self._entries.values()))
|
||||
self._save()
|
||||
return entry
|
||||
|
||||
def update_entry(
|
||||
self,
|
||||
entry_id: str,
|
||||
title: Optional[str] = None,
|
||||
content: Optional[str] = None,
|
||||
metadata: Optional[dict] = None,
|
||||
auto_link: bool = True,
|
||||
) -> ArchiveEntry:
|
||||
"""Update title, content, and/or metadata on an existing entry.
|
||||
|
||||
Bumps ``updated_at`` and re-runs auto-linking when content changes.
|
||||
|
||||
Args:
|
||||
entry_id: ID of the entry to update.
|
||||
title: New title, or None to leave unchanged.
|
||||
content: New content, or None to leave unchanged.
|
||||
metadata: Dict to merge into existing metadata (replaces keys present).
|
||||
auto_link: If True, re-run holographic linker after content change.
|
||||
|
||||
Returns:
|
||||
The updated ArchiveEntry.
|
||||
|
||||
Raises:
|
||||
KeyError: If entry_id does not exist.
|
||||
"""
|
||||
entry = self._entries.get(entry_id)
|
||||
if entry is None:
|
||||
raise KeyError(entry_id)
|
||||
|
||||
content_changed = False
|
||||
if title is not None and title != entry.title:
|
||||
entry.title = title
|
||||
content_changed = True
|
||||
if content is not None and content != entry.content:
|
||||
entry.content = content
|
||||
content_changed = True
|
||||
if metadata is not None:
|
||||
entry.metadata.update(metadata)
|
||||
|
||||
if content_changed:
|
||||
entry.content_hash = _compute_content_hash(entry.title, entry.content)
|
||||
|
||||
entry.updated_at = datetime.now(timezone.utc).isoformat()
|
||||
|
||||
if content_changed and auto_link:
|
||||
# Clear old links from this entry and re-run linker
|
||||
for other in self._entries.values():
|
||||
if entry_id in other.links:
|
||||
other.links.remove(entry_id)
|
||||
entry.links = []
|
||||
self.linker.apply_links(entry, list(self._entries.values()))
|
||||
|
||||
self._save()
|
||||
return entry
|
||||
|
||||
def get(self, entry_id: str) -> Optional[ArchiveEntry]:
|
||||
return self._entries.get(entry_id)
|
||||
|
||||
|
||||
@@ -6,12 +6,19 @@ with metadata, content, and links to related entries.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
import uuid
|
||||
|
||||
|
||||
def _compute_content_hash(title: str, content: str) -> str:
|
||||
"""Compute SHA-256 of title+content for deduplication."""
|
||||
raw = f"{title}\x00{content}".encode("utf-8")
|
||||
return hashlib.sha256(raw).hexdigest()
|
||||
|
||||
|
||||
@dataclass
|
||||
class ArchiveEntry:
|
||||
"""A single node in the Mnemosyne holographic archive."""
|
||||
@@ -24,7 +31,13 @@ class ArchiveEntry:
|
||||
topics: list[str] = field(default_factory=list)
|
||||
metadata: dict = field(default_factory=dict)
|
||||
created_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
|
||||
updated_at: Optional[str] = None # Set on mutation; None means same as created_at
|
||||
links: list[str] = field(default_factory=list) # IDs of related entries
|
||||
content_hash: Optional[str] = None # SHA-256 of title+content for dedup
|
||||
|
||||
def __post_init__(self):
|
||||
if self.content_hash is None:
|
||||
self.content_hash = _compute_content_hash(self.title, self.content)
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
@@ -36,7 +49,9 @@ class ArchiveEntry:
|
||||
"topics": self.topics,
|
||||
"metadata": self.metadata,
|
||||
"created_at": self.created_at,
|
||||
"updated_at": self.updated_at,
|
||||
"links": self.links,
|
||||
"content_hash": self.content_hash,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
|
||||
@@ -491,3 +491,178 @@ def test_tag_persistence_across_reload():
|
||||
fresh = a2.get(e.id)
|
||||
assert "beta" in fresh.topics
|
||||
assert "alpha" not in fresh.topics
|
||||
|
||||
|
||||
# --- content_hash and updated_at field tests ---
|
||||
|
||||
def test_entry_has_content_hash():
|
||||
e = ArchiveEntry(title="Hello", content="world")
|
||||
assert e.content_hash is not None
|
||||
assert len(e.content_hash) == 64 # SHA-256 hex
|
||||
|
||||
|
||||
def test_entry_content_hash_deterministic():
|
||||
e1 = ArchiveEntry(title="Hello", content="world")
|
||||
e2 = ArchiveEntry(title="Hello", content="world")
|
||||
assert e1.content_hash == e2.content_hash
|
||||
|
||||
|
||||
def test_entry_content_hash_differs_on_different_content():
|
||||
e1 = ArchiveEntry(title="Hello", content="world")
|
||||
e2 = ArchiveEntry(title="Hello", content="different")
|
||||
assert e1.content_hash != e2.content_hash
|
||||
|
||||
|
||||
def test_entry_updated_at_defaults_none():
|
||||
e = ArchiveEntry(title="T", content="c")
|
||||
assert e.updated_at is None
|
||||
|
||||
|
||||
def test_entry_roundtrip_includes_new_fields():
|
||||
e = ArchiveEntry(title="T", content="c")
|
||||
d = e.to_dict()
|
||||
assert "content_hash" in d
|
||||
assert "updated_at" in d
|
||||
e2 = ArchiveEntry.from_dict(d)
|
||||
assert e2.content_hash == e.content_hash
|
||||
assert e2.updated_at == e.updated_at
|
||||
|
||||
|
||||
# --- content deduplication tests ---
|
||||
|
||||
def test_add_deduplication_same_content():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
e1 = ingest_event(archive, title="Dup", content="Same content here")
|
||||
e2 = ingest_event(archive, title="Dup", content="Same content here")
|
||||
# Should NOT have created a second entry
|
||||
assert archive.count == 1
|
||||
assert e1.id == e2.id
|
||||
|
||||
|
||||
def test_add_deduplication_different_content():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
ingest_event(archive, title="A", content="Content one")
|
||||
ingest_event(archive, title="B", content="Content two")
|
||||
assert archive.count == 2
|
||||
|
||||
|
||||
def test_find_duplicate_returns_existing():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
e1 = ingest_event(archive, title="Dup", content="Same content here")
|
||||
probe = ArchiveEntry(title="Dup", content="Same content here")
|
||||
dup = archive.find_duplicate(probe)
|
||||
assert dup is not None
|
||||
assert dup.id == e1.id
|
||||
|
||||
|
||||
def test_find_duplicate_returns_none_for_unique():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
ingest_event(archive, title="A", content="Some content")
|
||||
probe = ArchiveEntry(title="B", content="Totally different content")
|
||||
assert archive.find_duplicate(probe) is None
|
||||
|
||||
|
||||
def test_find_duplicate_empty_archive():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
probe = ArchiveEntry(title="X", content="y")
|
||||
assert archive.find_duplicate(probe) is None
|
||||
|
||||
|
||||
# --- update_entry tests ---
|
||||
|
||||
def test_update_entry_title():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
e = ingest_event(archive, title="Old title", content="Some content")
|
||||
archive.update_entry(e.id, title="New title")
|
||||
fresh = archive.get(e.id)
|
||||
assert fresh.title == "New title"
|
||||
assert fresh.content == "Some content"
|
||||
|
||||
|
||||
def test_update_entry_content():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
e = ingest_event(archive, title="T", content="Old content")
|
||||
archive.update_entry(e.id, content="New content")
|
||||
fresh = archive.get(e.id)
|
||||
assert fresh.content == "New content"
|
||||
|
||||
|
||||
def test_update_entry_metadata():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
e = ingest_event(archive, title="T", content="c")
|
||||
archive.update_entry(e.id, metadata={"key": "value"})
|
||||
fresh = archive.get(e.id)
|
||||
assert fresh.metadata["key"] == "value"
|
||||
|
||||
|
||||
def test_update_entry_bumps_updated_at():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
e = ingest_event(archive, title="T", content="c")
|
||||
assert e.updated_at is None
|
||||
archive.update_entry(e.id, title="Updated")
|
||||
fresh = archive.get(e.id)
|
||||
assert fresh.updated_at is not None
|
||||
|
||||
|
||||
def test_update_entry_refreshes_content_hash():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
e = ingest_event(archive, title="T", content="Original content")
|
||||
old_hash = e.content_hash
|
||||
archive.update_entry(e.id, content="Completely new content")
|
||||
fresh = archive.get(e.id)
|
||||
assert fresh.content_hash != old_hash
|
||||
|
||||
|
||||
def test_update_entry_missing_raises():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
try:
|
||||
archive.update_entry("nonexistent-id", title="X")
|
||||
assert False, "Expected KeyError"
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
|
||||
def test_update_entry_persists_across_reload():
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
a1 = MnemosyneArchive(archive_path=path)
|
||||
e = ingest_event(a1, title="Before", content="Before content")
|
||||
a1.update_entry(e.id, title="After", content="After content")
|
||||
|
||||
a2 = MnemosyneArchive(archive_path=path)
|
||||
fresh = a2.get(e.id)
|
||||
assert fresh.title == "After"
|
||||
assert fresh.content == "After content"
|
||||
assert fresh.updated_at is not None
|
||||
|
||||
|
||||
def test_update_entry_no_change_no_crash():
|
||||
"""Calling update_entry with all None args should not fail."""
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "test_archive.json"
|
||||
archive = MnemosyneArchive(archive_path=path)
|
||||
e = ingest_event(archive, title="T", content="c")
|
||||
result = archive.update_entry(e.id)
|
||||
assert result.title == "T"
|
||||
|
||||
Reference in New Issue
Block a user