# the-nexus/nexus/mnemosyne/archive.py

"""MnemosyneArchive — core archive class.

The living holographic archive. Stores entries, maintains links,
and provides query interfaces for retrieving connected knowledge.
"""

from __future__ import annotations

import json
import logging
from pathlib import Path
from typing import Optional

from nexus.mnemosyne.entry import ArchiveEntry
from nexus.mnemosyne.linker import HolographicLinker

# Emitted as the "version" field of the dict returned by MnemosyneArchive.export().
_EXPORT_VERSION = "1"


class MnemosyneArchive:
    """The holographic archive — stores and links entries.

    Phase 1 uses JSON file storage. Phase 2 will integrate with
    MemPalace (ChromaDB) for vector-semantic search.

    Entries live in memory in a dict keyed by ``entry.id``; the complete
    archive is rewritten to ``self.path`` after every ``add`` / ``remove``.
    """

    def __init__(self, archive_path: Optional[Path] = None):
        """Open the archive at *archive_path*, creating its directory if needed.

        Args:
            archive_path: Location of the JSON file. When omitted (or falsy)
                ``~/.hermes/mnemosyne/archive.json`` is used. A plain string
                path is accepted too and converted to ``Path``.
        """
        if archive_path:
            self.path = Path(archive_path)
        else:
            self.path = Path.home() / ".hermes" / "mnemosyne" / "archive.json"
        self.path.parent.mkdir(parents=True, exist_ok=True)
        self.linker = HolographicLinker()
        self._entries: dict[str, ArchiveEntry] = {}
        self._load()

    @staticmethod
    def _search_text(entry: ArchiveEntry) -> str:
        """Return the lower-cased title, content and topics of *entry* as one string."""
        return f"{entry.title} {entry.content} {' '.join(entry.topics)}".lower()

    def _load(self):
        """Populate ``self._entries`` from ``self.path`` (best effort).

        A missing file is not an error. An unreadable or malformed file never
        raises: the problem is logged as a warning and whatever could be
        parsed is kept, so one bad record does not discard the others.
        """
        if not self.path.exists():
            return
        log = logging.getLogger(__name__)

        try:
            with open(self.path, encoding="utf-8") as f:
                data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as exc:
            log.warning("Archive %s is not valid JSON (%s); starting empty", self.path, exc)
            return

        # Valid JSON can still have the wrong shape (e.g. a top-level list).
        if not isinstance(data, dict):
            log.warning("Archive %s has no top-level object; starting empty", self.path)
            return
        records = data.get("entries", [])
        if not isinstance(records, list):
            log.warning("Archive %s: 'entries' is not a list; starting empty", self.path)
            return

        skipped = 0
        for record in records:
            try:
                entry = ArchiveEntry.from_dict(record)
            except KeyError:
                # Record lacks a key from_dict needs; drop only this record.
                skipped += 1
                continue
            self._entries[entry.id] = entry
        if skipped:
            log.warning("Archive %s: skipped %d malformed entries", self.path, skipped)

    def _save(self):
        """Write every entry to ``self.path`` as JSON.

        The payload is serialised in memory first and written to a sibling
        ``*.tmp`` file that then replaces the archive, so neither a
        serialisation error nor an interrupted write can leave a truncated
        archive behind. A stale ``*.tmp`` left by a failed write is simply
        overwritten by the next save.
        """
        payload = {
            "entries": [e.to_dict() for e in self._entries.values()],
            "count": len(self._entries),
        }
        text = json.dumps(payload, indent=2)
        tmp_path = self.path.with_name(self.path.name + ".tmp")
        tmp_path.write_text(text, encoding="utf-8")
        tmp_path.replace(self.path)

    def add(self, entry: ArchiveEntry, auto_link: bool = True) -> ArchiveEntry:
        """Store *entry* under ``entry.id`` and persist the archive.

        An existing entry with the same id is overwritten.

        Args:
            entry: The entry to store.
            auto_link: When true, hand the entry and the full list of stored
                entries (the new one included) to ``self.linker.apply_links``.

        Returns:
            The same *entry* object that was passed in.
        """
        self._entries[entry.id] = entry
        if auto_link:
            self.linker.apply_links(entry, list(self._entries.values()))
        self._save()
        return entry

    def get(self, entry_id: str) -> Optional[ArchiveEntry]:
        """Return the entry stored under *entry_id*, or ``None`` if absent."""
        return self._entries.get(entry_id)

    def search(self, query: str, limit: int = 10) -> list[ArchiveEntry]:
        """Simple keyword search across titles, content and topics.

        The query is split on whitespace; each distinct token that occurs as
        a substring of an entry's lower-cased text counts as one hit. Entries
        with at least one hit are returned, most hits first (ties keep
        storage order), truncated to *limit*. An empty query matches nothing.
        """
        query_tokens = set(query.lower().split())
        if not query_tokens:
            return []
        scored = []
        for entry in self._entries.values():
            text = self._search_text(entry)
            hits = sum(1 for token in query_tokens if token in text)
            if hits:
                scored.append((hits, entry))
        # list.sort is stable, so equally-scored entries keep insertion order.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return [entry for _, entry in scored[:limit]]

    def get_linked(self, entry_id: str, depth: int = 1) -> list[ArchiveEntry]:
        """Get entries reachable from *entry_id* within *depth* link hops.

        Traversal is breadth-first: direct links come first, in the order
        they appear in ``entry.links``, then their links, and so on. Each
        entry appears at most once and the starting entry is never included.
        Link ids that do not resolve to a stored entry are ignored.

        Returns an empty list when *entry_id* is unknown or ``depth <= 0``.
        """
        # ``seen`` covers the seed and everything already queued, so a node
        # reached from two different parents is only reported once.
        seen = {entry_id}
        frontier = [entry_id]
        result = []
        for _ in range(depth):
            next_frontier = []
            for eid in frontier:
                entry = self._entries.get(eid)
                if entry is None:
                    continue
                for linked_id in entry.links:
                    if linked_id in seen:
                        continue
                    linked = self._entries.get(linked_id)
                    if linked is None:
                        continue
                    seen.add(linked_id)
                    result.append(linked)
                    next_frontier.append(linked_id)
            if not next_frontier:
                break  # nothing new to expand; deeper levels would be empty
            frontier = next_frontier
        return result

    def by_topic(self, topic: str) -> list[ArchiveEntry]:
        """Get all entries tagged with *topic* (case-insensitive exact match)."""
        wanted = topic.lower()
        return [
            e for e in self._entries.values()
            if any(t.lower() == wanted for t in e.topics)
        ]

    def remove(self, entry_id: str) -> bool:
        """Remove an entry and clean up all bidirectional links.

        Every occurrence of *entry_id* is stripped from the ``links`` of the
        remaining entries, then the archive is persisted.

        Returns True if the entry existed and was removed, False otherwise.
        """
        if entry_id not in self._entries:
            return False
        del self._entries[entry_id]
        for other in self._entries.values():
            # ``remove`` drops one occurrence at a time; loop in case the id
            # was recorded more than once.
            while entry_id in other.links:
                other.links.remove(entry_id)
        self._save()
        return True

    def export(
        self,
        query: Optional[str] = None,
        topics: Optional[list[str]] = None,
    ) -> dict:
        """Export a filtered subset of the archive.

        Args:
            query: whitespace-separated keywords; an entry is kept when at
                least one keyword occurs in its title, content or topics
                (case-insensitive substring match)
            topics: list of topic tags; entries must match at least one
                (case-insensitive)

        Both filters are optional and are combined with AND when both are
        given. A ``None`` or empty filter is not applied.

        Returns a JSON-serialisable dict with ``version``, the ``filters``
        as passed in, ``count`` and an ``entries`` list.
        """
        candidates = list(self._entries.values())

        if topics:
            lower_topics = {t.lower() for t in topics}
            candidates = [
                e for e in candidates
                if any(t.lower() in lower_topics for t in e.topics)
            ]

        if query:
            query_tokens = set(query.lower().split())
            candidates = [
                e for e in candidates
                if any(token in self._search_text(e) for token in query_tokens)
            ]

        return {
            "version": _EXPORT_VERSION,
            "filters": {"query": query, "topics": topics},
            "count": len(candidates),
            "entries": [e.to_dict() for e in candidates],
        }

    def topic_counts(self) -> dict[str, int]:
        """Return a dict mapping topic name → entry count, sorted by count desc.

        Topic names are counted exactly as stored (case-sensitive); topics
        with equal counts keep first-seen order.
        """
        counts: dict[str, int] = {}
        for entry in self._entries.values():
            for topic in entry.topics:
                counts[topic] = counts.get(topic, 0) + 1
        return dict(sorted(counts.items(), key=lambda item: item[1], reverse=True))

    @property
    def count(self) -> int:
        """Number of entries currently stored."""
        return len(self._entries)

    def stats(self) -> dict:
        """Return summary statistics about the archive.

        Keys: ``entries`` (count), ``total_links`` (sum of every entry's
        ``links`` length), ``unique_topics`` and the sorted ``topics`` list,
        ``orphans`` (entries whose own ``links`` is empty), ``link_density``
        (``total_links / entries`` rounded to 4 places, ``0.0`` when empty)
        and ``oldest_entry`` / ``newest_entry`` (smallest / largest
        ``created_at``, ``None`` when the archive is empty).
        """
        entries = list(self._entries.values())
        n = len(entries)
        total_links = sum(len(e.links) for e in entries)

        topics: set[str] = set()
        for e in entries:
            topics.update(e.topics)

        # Orphans: entries with no links at all
        orphans = sum(1 for e in entries if not e.links)

        # Link density: average links per entry (0 when empty)
        link_density = round(total_links / n, 4) if n else 0.0

        # Age distribution — only the extremes are needed, so no full sort.
        created = [e.created_at for e in entries]
        oldest_entry = min(created, default=None)
        newest_entry = max(created, default=None)

        return {
            "entries": n,
            "total_links": total_links,
            "unique_topics": len(topics),
            "topics": sorted(topics),
            "orphans": orphans,
            "link_density": link_density,
            "oldest_entry": oldest_entry,
            "newest_entry": newest_entry,
        }