Compare commits

...

5 Commits

Author SHA1 Message Date
Alexander Whitestone
badb1f2b93 chore: remove pycache from tracking
Some checks failed
CI / test (pull_request) Failing after 10s
CI / validate (pull_request) Failing after 14s
Review Approval Gate / verify-review (pull_request) Failing after 3s
2026-04-11 19:40:18 -04:00
Alexander Whitestone
238a340251 feat(mnemosyne): add entry update, content dedup, and find_duplicates
Closes #1239

- ArchiveEntry: add updated_at field, content_hash property, touch()
- MnemosyneArchive.update_entry(): update title/content/metadata, auto re-link
- MnemosyneArchive.find_by_hash(): lookup by SHA-256 of title+content
- MnemosyneArchive.find_duplicates(): find groups of identical entries
- MnemosyneArchive.add(skip_dups=True): skip duplicate entries on ingest
- Backward compat: from_dict backfills updated_at for legacy entries
- 18 new tests, all passing
2026-04-11 19:40:05 -04:00
Alexander Whitestone
a896d58d93 wip: add tests for update_entry, find_duplicates, skip_dups, content_hash
2026-04-11 19:39:11 -04:00
Alexander Whitestone
5e26ee0a7d wip: add update_entry, find_by_hash, find_duplicates, skip_dups to archive
2026-04-11 19:38:17 -04:00
Alexander Whitestone
d7343d1be2 wip: add updated_at, content_hash, touch() to ArchiveEntry
2026-04-11 19:37:39 -04:00
12 changed files with 303 additions and 3 deletions

.gitignore

@@ -8,3 +8,4 @@ mempalace/__pycache__/
# Prevent agents from writing to wrong path (see issue #1145) # Prevent agents from writing to wrong path (see issue #1145)
public/nexus/ public/nexus/
test-screenshots/ test-screenshots/
__pycache__/

5 binary files not shown.

@@ -49,8 +49,22 @@ class MnemosyneArchive:
         with open(self.path, "w") as f:
             json.dump(data, f, indent=2)
 
-    def add(self, entry: ArchiveEntry, auto_link: bool = True) -> ArchiveEntry:
-        """Add an entry to the archive. Auto-links to related entries."""
+    def add(self, entry: ArchiveEntry, auto_link: bool = True, skip_dups: bool = False) -> ArchiveEntry:
+        """Add an entry to the archive. Auto-links to related entries.
+
+        Args:
+            entry: The entry to add.
+            auto_link: Whether to automatically compute holographic links.
+            skip_dups: If True, return existing entry instead of adding a duplicate
+                (same title+content hash).
+
+        Returns:
+            The added (or existing, if skip_dups=True and duplicate found) entry.
+        """
+        if skip_dups:
+            existing = self.find_by_hash(entry.content_hash)
+            if existing:
+                return existing
         self._entries[entry.id] = entry
         if auto_link:
             self.linker.apply_links(entry, list(self._entries.values()))
@@ -581,6 +595,83 @@ class MnemosyneArchive:
         self._save()
         return entry
 
+    def update_entry(
+        self,
+        entry_id: str,
+        title: Optional[str] = None,
+        content: Optional[str] = None,
+        metadata: Optional[dict] = None,
+        re_link: bool = True,
+    ) -> ArchiveEntry:
+        """Update fields on an existing entry.
+
+        Only provided fields are changed. Bumps updated_at and, when the
+        content hash actually changed, recomputes holographic links.
+
+        Args:
+            entry_id: ID of the entry to update.
+            title: New title (None = keep existing).
+            content: New content (None = keep existing).
+            metadata: New metadata dict (None = keep existing, {} to clear).
+            re_link: Whether to recompute holographic links after update.
+
+        Returns:
+            The updated ArchiveEntry.
+
+        Raises:
+            KeyError: If entry_id does not exist.
+        """
+        entry = self._entries.get(entry_id)
+        if entry is None:
+            raise KeyError(entry_id)
+        old_hash = entry.content_hash
+        if title is not None:
+            entry.title = title
+        if content is not None:
+            entry.content = content
+        if metadata is not None:
+            entry.metadata = metadata
+        entry.touch()
+        # Re-link only if the title or content actually changed (both feed the hash)
+        if re_link and entry.content_hash != old_hash:
+            # Clear existing links to this entry from others
+            for other in self._entries.values():
+                if entry_id in other.links:
+                    other.links.remove(entry_id)
+            entry.links = []
+            # Re-apply
+            self.linker.apply_links(entry, list(self._entries.values()))
+        self._save()
+        return entry
+
+    def find_by_hash(self, content_hash: str) -> Optional[ArchiveEntry]:
+        """Find an entry by its content hash (title + content SHA-256).
+
+        Returns the first match, or None if no entry has this hash.
+        """
+        for entry in self._entries.values():
+            if entry.content_hash == content_hash:
+                return entry
+        return None
+
+    def find_duplicates(self) -> list[list[ArchiveEntry]]:
+        """Find groups of entries with identical content hashes.
+
+        Returns a list of groups, where each group is a list of 2+ entries
+        sharing the same title+content. Sorted by group size descending.
+        """
+        hash_groups: dict[str, list[ArchiveEntry]] = {}
+        for entry in self._entries.values():
+            h = entry.content_hash
+            hash_groups.setdefault(h, []).append(entry)
+        dups = [group for group in hash_groups.values() if len(group) > 1]
+        dups.sort(key=lambda g: len(g), reverse=True)
+        return dups
+
     def rebuild_links(self, threshold: Optional[float] = None) -> int:
         """Recompute all links from scratch.


@@ -6,6 +6,7 @@ with metadata, content, and links to related entries.
 from __future__ import annotations
 
+import hashlib
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
 from typing import Optional
@@ -24,8 +25,19 @@ class ArchiveEntry:
     topics: list[str] = field(default_factory=list)
     metadata: dict = field(default_factory=dict)
     created_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
+    updated_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
     links: list[str] = field(default_factory=list)  # IDs of related entries
 
+    @property
+    def content_hash(self) -> str:
+        """SHA-256 hash of title + content for dedup detection."""
+        raw = f"{self.title}\x00{self.content}".encode()
+        return hashlib.sha256(raw).hexdigest()
+
+    def touch(self):
+        """Bump updated_at to now."""
+        self.updated_at = datetime.now(timezone.utc).isoformat()
+
     def to_dict(self) -> dict:
         return {
             "id": self.id,
@@ -36,9 +48,16 @@
             "topics": self.topics,
             "metadata": self.metadata,
             "created_at": self.created_at,
+            "updated_at": self.updated_at,
             "links": self.links,
+            "content_hash": self.content_hash,
         }
 
     @classmethod
     def from_dict(cls, data: dict) -> ArchiveEntry:
-        return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__})
+        # Strip non-field keys (like content_hash, which is computed)
+        filtered = {k: v for k, v in data.items() if k in cls.__dataclass_fields__}
+        # Backfill updated_at for legacy entries that lack it
+        if "updated_at" not in filtered:
+            filtered["updated_at"] = filtered.get("created_at", datetime.now(timezone.utc).isoformat())
+        return cls(**filtered)
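
One design note on the hash above: title and content are joined with a NUL byte before hashing, so pairs like ("ab", "c") and ("a", "bc") cannot collide at the boundary. A standalone sketch of the same construction (the helper name is illustrative; only hashlib is assumed):

    import hashlib

    def entry_content_hash(title: str, content: str) -> str:
        # Illustrative helper mirroring ArchiveEntry.content_hash:
        # the \x00 separator makes the title/content boundary unambiguous
        raw = f"{title}\x00{content}".encode()
        return hashlib.sha256(raw).hexdigest()

On round-trip, to_dict() emits the computed content_hash for consumers, while from_dict() filters it back out (it is a property, not a dataclass field) and backfills updated_at from created_at for legacy entries.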


@@ -491,3 +491,192 @@ def test_tag_persistence_across_reload():
     fresh = a2.get(e.id)
     assert "beta" in fresh.topics
     assert "alpha" not in fresh.topics
+
+
+# --- Entry update + dedup tests ---
+
+def test_content_hash_deterministic():
+    e1 = ArchiveEntry(title="Test", content="Hello")
+    e2 = ArchiveEntry(title="Test", content="Hello")
+    assert e1.content_hash == e2.content_hash
+
+
+def test_content_hash_differs_on_change():
+    e = ArchiveEntry(title="Test", content="Hello")
+    h1 = e.content_hash
+    e.content = "World"
+    assert e.content_hash != h1
+
+
+def test_updated_at_set_on_creation():
+    e = ArchiveEntry(title="T", content="c")
+    assert e.updated_at is not None
+    assert e.updated_at >= e.created_at
+
+
+def test_touch_updates_timestamp():
+    import time
+    e = ArchiveEntry(title="T", content="c")
+    before = e.updated_at
+    time.sleep(0.01)
+    e.touch()
+    assert e.updated_at > before  # strictly newer after the sleep
+
+
+def test_update_entry_title():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        e = ingest_event(archive, title="Old", content="content", topics=["x"])
+        old_hash = e.content_hash
+        updated = archive.update_entry(e.id, title="New Title")
+        assert updated.title == "New Title"
+        assert updated.content == "content"
+        assert updated.updated_at >= e.created_at
+        # Title is part of the hash, so it changes even though content didn't
+        assert updated.content_hash != old_hash
+
+
+def test_update_entry_content_relinks():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        e1 = ingest_event(archive, title="Python", content="Python programming language")
+        e2 = ingest_event(archive, title="Java", content="Java programming language")
+        # e1 and e2 should be linked via shared tokens
+        assert e2.id in e1.links or e1.id in e2.links
+        # Update e1 to completely different content
+        archive.update_entry(e1.id, content="Cooking recipes for dinner")
+        e1_fresh = archive.get(e1.id)
+        e2_fresh = archive.get(e2.id)
+        assert e1_fresh.content == "Cooking recipes for dinner"
+        # update_entry clears stale inbound links, so e2 no longer references e1
+        assert e1.id not in e2_fresh.links
+
+
+def test_update_entry_metadata():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        e = ingest_event(archive, title="T", content="c")
+        archive.update_entry(e.id, metadata={"key": "value"})
+        fresh = archive.get(e.id)
+        assert fresh.metadata == {"key": "value"}
+
+
+def test_update_entry_missing_raises():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        try:
+            archive.update_entry("nonexistent", title="X")
+            assert False, "Expected KeyError"
+        except KeyError:
+            pass
+
+
+def test_update_entry_no_change_no_relink():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        e = ingest_event(archive, title="T", content="c", topics=["x"])
+        orig_links = list(e.links)
+        # Update only metadata (no content change)
+        archive.update_entry(e.id, metadata={"k": "v"})
+        fresh = archive.get(e.id)
+        assert fresh.links == orig_links
+
+
+def test_find_by_hash():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        e = ingest_event(archive, title="Unique", content="Unique content xyz")
+        found = archive.find_by_hash(e.content_hash)
+        assert found is not None
+        assert found.id == e.id
+
+
+def test_find_by_hash_miss():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        found = archive.find_by_hash("nonexistent-hash")
+        assert found is None
+
+
+def test_find_duplicates():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        e1 = ingest_event(archive, title="Same", content="Duplicate content")
+        # Manually add a second entry with identical title+content
+        e2 = ArchiveEntry(title="Same", content="Duplicate content", source="manual")
+        archive._entries[e2.id] = e2
+        archive._save()
+        dups = archive.find_duplicates()
+        assert len(dups) == 1
+        assert len(dups[0]) == 2
+        dup_ids = {d.id for d in dups[0]}
+        assert e1.id in dup_ids
+        assert e2.id in dup_ids
+
+
+def test_find_duplicates_none():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        ingest_event(archive, title="A", content="unique a")
+        ingest_event(archive, title="B", content="unique b")
+        dups = archive.find_duplicates()
+        assert dups == []
+
+
+def test_add_skip_dups():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        e1 = ingest_event(archive, title="Test", content="Content here")
+        # Try to add exact same entry with skip_dups=True
+        e2 = ArchiveEntry(title="Test", content="Content here")
+        result = archive.add(e2, skip_dups=True)
+        assert result.id == e1.id  # returned existing, not new
+        assert archive.count == 1
+
+
+def test_add_skip_dups_allows_different():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "test_archive.json"
+        archive = MnemosyneArchive(archive_path=path)
+        e1 = ingest_event(archive, title="A", content="Content A")
+        e2 = ArchiveEntry(title="B", content="Content B")
+        result = archive.add(e2, skip_dups=True)
+        assert result.id == e2.id  # new entry added
+        assert archive.count == 2
+
+
+def test_entry_roundtrip_with_updated_at():
+    e = ArchiveEntry(title="T", content="c", topics=["x"])
+    d = e.to_dict()
+    e2 = ArchiveEntry.from_dict(d)
+    assert e2.updated_at == e.updated_at
+    assert "content_hash" in d
+
+
+def test_entry_from_dict_backfills_updated_at():
+    """Legacy entries without updated_at should get it from created_at."""
+    data = {
+        "id": "test-id",
+        "title": "Legacy",
+        "content": "old entry",
+        "source": "manual",
+        "source_ref": None,
+        "topics": [],
+        "metadata": {},
+        "created_at": "2025-01-01T00:00:00+00:00",
+        "links": [],
+    }
+    e = ArchiveEntry.from_dict(data)
+    assert e.updated_at == "2025-01-01T00:00:00+00:00"