Compare commits

...

3 Commits

Author SHA1 Message Date
Alexander Whitestone
3848b6f4ea test(mnemosyne): graph cluster analysis tests — 22 tests
Some checks failed
CI / test (pull_request) Failing after 10s
CI / validate (pull_request) Failing after 14s
Review Approval Gate / verify-review (pull_request) Failing after 3s
- graph_clusters: empty, orphans, linked pairs, separate clusters, topics, density
- hub_entries: ordering, limit, inbound/outbound counting
- bridge_entries: triangle (none), chain (B is bridge), small cluster filtered
- rebuild_links: creates links, threshold override, persistence
2026-04-11 18:44:58 -04:00
Alexander Whitestone
3ed129ad2b feat(mnemosyne): CLI commands for graph analysis
- mnemosyne clusters: show connected component clusters with density + topics
- mnemosyne hubs: most connected entries by degree centrality
- mnemosyne bridges: articulation points (entries connecting clusters)
- mnemosyne rebuild: recompute all links from scratch
2026-04-11 18:43:14 -04:00
Alexander Whitestone
392c73eb03 feat(mnemosyne): graph cluster analysis — clusters, hubs, bridges, rebuild_links
- graph_clusters(): BFS connected component discovery with density + topic analysis
- hub_entries(): degree centrality ranking of most connected entries
- bridge_entries(): Tarjan's articulation points — entries that connect clusters
- rebuild_links(): full link recomputation after bulk ingestion
- _build_adjacency(): internal adjacency builder with validation
2026-04-11 18:42:32 -04:00
3 changed files with 585 additions and 1 deletions

View File

@@ -241,3 +241,247 @@ class MnemosyneArchive:
"oldest_entry": oldest_entry,
"newest_entry": newest_entry,
}
def _build_adjacency(self) -> dict[str, set[str]]:
"""Build adjacency dict from entry links. Only includes valid references."""
adj: dict[str, set[str]] = {eid: set() for eid in self._entries}
for eid, entry in self._entries.items():
for linked_id in entry.links:
if linked_id in self._entries and linked_id != eid:
adj[eid].add(linked_id)
adj[linked_id].add(eid)
return adj
def graph_clusters(self, min_size: int = 1) -> list[dict]:
    """Find connected component clusters in the holographic graph.

    Uses BFS to discover groups of entries that are reachable from each
    other through their links. Returns clusters sorted by size descending.

    Args:
        min_size: Minimum cluster size to include (filters out isolated entries).

    Returns:
        List of dicts with keys: cluster_id, size, entries, top_topics,
        internal_edges, density.
    """
    # Local import keeps the method self-contained; deque gives O(1)
    # popleft (list.pop(0) is O(n) per dequeue, quadratic per component).
    from collections import deque

    adj = self._build_adjacency()
    visited: set[str] = set()
    clusters: list[dict] = []
    cluster_id = 0
    for eid in self._entries:
        if eid in visited:
            continue
        # BFS from this entry to collect its connected component.
        component: list[str] = []
        queue = deque([eid])
        while queue:
            current = queue.popleft()
            if current in visited:
                continue
            visited.add(current)
            component.append(current)
            for neighbor in adj.get(current, set()):
                if neighbor not in visited:
                    queue.append(neighbor)
        # Drop clusters below the requested size (orphans, by default, stay).
        if len(component) < min_size:
            continue
        # Aggregate topic frequencies and count internal edges.
        cluster_topics: dict[str, int] = {}
        internal_edges = 0
        for cid in component:
            entry = self._entries[cid]
            for t in entry.topics:
                cluster_topics[t] = cluster_topics.get(t, 0) + 1
            internal_edges += len(adj.get(cid, set()))
        internal_edges //= 2  # undirected: each edge counted from both ends
        # Density: actual edges / possible edges in a clique of this size.
        n = len(component)
        max_edges = n * (n - 1) // 2
        density = round(internal_edges / max_edges, 4) if max_edges > 0 else 0.0
        # Top 5 topics by frequency.
        top_topics = sorted(cluster_topics.items(), key=lambda x: x[1], reverse=True)[:5]
        clusters.append({
            "cluster_id": cluster_id,
            "size": n,
            "entries": component,
            "top_topics": [t for t, _ in top_topics],
            "internal_edges": internal_edges,
            "density": density,
        })
        cluster_id += 1
    clusters.sort(key=lambda c: c["size"], reverse=True)
    return clusters
def hub_entries(self, limit: int = 10) -> list[dict]:
    """Rank entries by degree centrality and return the top hubs.

    Hubs are the most connected entries in the holographic graph — the
    ones that bridge many topics and attract many links.

    Args:
        limit: Maximum number of hubs to return.

    Returns:
        List of dicts with keys: entry, degree, inbound, outbound, topics
    """
    adjacency = self._build_adjacency()
    # Count inbound references: how many other entries link TO each id.
    incoming = {entry_id: 0 for entry_id in self._entries}
    for source in self._entries.values():
        for target_id in source.links:
            if target_id in incoming:
                incoming[target_id] += 1
    ranked = []
    for entry_id, entry in self._entries.items():
        degree = len(adjacency.get(entry_id, set()))
        if not degree:
            continue  # unconnected entries can never be hubs
        ranked.append({
            "entry": entry,
            "degree": degree,
            "inbound": incoming.get(entry_id, 0),
            "outbound": len(entry.links),
            "topics": entry.topics,
        })
    ranked.sort(key=lambda record: record["degree"], reverse=True)
    return ranked[:limit]
def bridge_entries(self) -> list[dict]:
    """Find articulation points — entries whose removal would split a cluster.

    These are "bridge" entries in the holographic graph. Removing them
    disconnects members that were previously reachable through the bridge.
    Uses Tarjan's algorithm for finding articulation points.

    Returns:
        List of dicts with keys: entry, cluster_size,
        components_after_removal, topics — sorted by
        components_after_removal descending.
    """
    adj = self._build_adjacency()
    # Find clusters first — articulation points are only meaningful in
    # components of 3+ entries (min_size=3 filters the rest up front).
    clusters = self.graph_clusters(min_size=3)
    if not clusters:
        return []
    # For each cluster, run Tarjan's algorithm
    bridges: list[dict] = []
    for cluster in clusters:
        members = set(cluster["entries"])
        if len(members) < 3:
            continue
        # Build subgraph adjacency restricted to this cluster's members.
        sub_adj = {eid: adj[eid] & members for eid in members}
        # Tarjan's DFS state: discovery times, low-link values, DFS-tree
        # parents, and the resulting articulation-point set. Recreated per
        # cluster so clusters never share state.
        discovery: dict[str, int] = {}
        low: dict[str, int] = {}
        parent: dict[str, Optional[str]] = {}
        ap: set[str] = set()
        timer = [0]  # one-element list: mutable counter shared with the closure
        def dfs(u: str):
            # Recursive DFS; NOTE(review): deep clusters could approach
            # Python's recursion limit — unverified for large archives.
            children = 0
            discovery[u] = low[u] = timer[0]
            timer[0] += 1
            for v in sub_adj[u]:
                if v not in discovery:
                    # Tree edge: recurse, then propagate the child's low-link.
                    children += 1
                    parent[v] = u
                    dfs(v)
                    low[u] = min(low[u], low[v])
                    # u is AP if: root with 2+ children, or non-root with low[v] >= disc[u]
                    if parent.get(u) is None and children > 1:
                        ap.add(u)
                    if parent.get(u) is not None and low[v] >= discovery[u]:
                        ap.add(u)
                elif v != parent.get(u):
                    # Back edge: take the neighbor's discovery time into low[u].
                    low[u] = min(low[u], discovery[v])
        for eid in members:
            if eid not in discovery:
                parent[eid] = None  # roots of the DFS forest have no parent
                dfs(eid)
        # For each articulation point, estimate what it bridges
        for ap_id in ap:
            ap_entry = self._entries[ap_id]
            # Remove it temporarily and count resulting components
            temp_adj = {k: v.copy() for k, v in sub_adj.items()}
            del temp_adj[ap_id]
            for k in temp_adj:
                temp_adj[k].discard(ap_id)
            # BFS count components after removal
            temp_visited: set[str] = set()
            component_count = 0
            for mid in members:
                if mid == ap_id or mid in temp_visited:
                    continue
                # Each unvisited member starts a new component.
                component_count += 1
                queue = [mid]
                while queue:
                    cur = queue.pop(0)
                    if cur in temp_visited:
                        continue
                    temp_visited.add(cur)
                    for nb in temp_adj.get(cur, set()):
                        if nb not in temp_visited:
                            queue.append(nb)
            # Only report genuine splits (removal left >1 component).
            if component_count > 1:
                bridges.append({
                    "entry": ap_entry,
                    "cluster_size": cluster["size"],
                    "components_after_removal": component_count,
                    "topics": ap_entry.topics,
                })
    bridges.sort(key=lambda b: b["components_after_removal"], reverse=True)
    return bridges
def rebuild_links(self, threshold: Optional[float] = None) -> int:
    """Recompute all links from scratch.

    Clears existing links and re-applies the holographic linker to every
    entry pair. Useful after bulk ingestion or threshold changes.

    Args:
        threshold: Override the linker's default similarity threshold.
            The linker's original threshold is always restored afterwards,
            even if linking raises.

    Returns:
        Total number of links created.
    """
    old_threshold = None
    if threshold is not None:
        old_threshold = self.linker.threshold
        self.linker.threshold = threshold
    try:
        # Clear all links before recomputing.
        for entry in self._entries.values():
            entry.links = []
        entries = list(self._entries.values())
        total_links = 0
        # Re-link each entry against all others.
        for entry in entries:
            candidates = [e for e in entries if e.id != entry.id]
            total_links += self.linker.apply_links(entry, candidates)
    finally:
        # Restore caller-visible linker state even when apply_links raises
        # (the original code leaked the override on exception).
        if threshold is not None:
            self.linker.threshold = old_threshold
    self._save()
    return total_links

View File

@@ -1,7 +1,8 @@
"""CLI interface for Mnemosyne.
Provides: mnemosyne ingest, mnemosyne search, mnemosyne link, mnemosyne stats,
mnemosyne topics, mnemosyne remove, mnemosyne export
mnemosyne topics, mnemosyne remove, mnemosyne export,
mnemosyne clusters, mnemosyne hubs, mnemosyne bridges, mnemosyne rebuild
"""
from __future__ import annotations
@@ -90,6 +91,58 @@ def cmd_export(args):
print(json.dumps(data, indent=2))
def cmd_clusters(args):
    """Print connected-component clusters, optionally listing member entries."""
    archive = MnemosyneArchive()
    clusters = archive.graph_clusters(min_size=args.min_size)
    if not clusters:
        print("No clusters found.")
        return
    for cluster in clusters:
        print(f"Cluster {cluster['cluster_id']}: {cluster['size']} entries, density={cluster['density']}")
        topic_names = cluster["top_topics"]
        print(f" Topics: {', '.join(topic_names) if topic_names else '(none)'}")
        if args.verbose:
            for entry_id in cluster["entries"]:
                member = archive.get(entry_id)
                if member:
                    print(f" [{entry_id[:8]}] {member.title}")
        print()
def cmd_hubs(args):
    """Print the most connected entries, ranked by degree."""
    archive = MnemosyneArchive()
    hubs = archive.hub_entries(limit=args.limit)
    if not hubs:
        print("No hubs found.")
        return
    for hub in hubs:
        entry = hub["entry"]
        print(f"[{entry.id[:8]}] {entry.title}")
        print(f" Degree: {hub['degree']} (in: {hub['inbound']}, out: {hub['outbound']})")
        print(f" Topics: {', '.join(hub['topics']) if hub['topics'] else '(none)'}")
        print()
def cmd_bridges(args):
    """Print articulation-point entries that hold clusters together."""
    archive = MnemosyneArchive()
    bridges = archive.bridge_entries()
    if not bridges:
        print("No bridge entries found.")
        return
    for bridge in bridges:
        entry = bridge["entry"]
        print(f"[{entry.id[:8]}] {entry.title}")
        print(f" Bridges {bridge['components_after_removal']} components (cluster: {bridge['cluster_size']} entries)")
        print(f" Topics: {', '.join(bridge['topics']) if bridge['topics'] else '(none)'}")
        print()
def cmd_rebuild(args):
    """Recompute every link, optionally with a similarity-threshold override."""
    archive = MnemosyneArchive()
    # Pass the threshold through unchanged: argparse already defaults it to
    # None, and the previous truthiness check (`if args.threshold`) silently
    # discarded an explicit `-t 0.0` override.
    total = archive.rebuild_links(threshold=args.threshold)
    print(f"Rebuilt links: {total} connections across {archive.count} entries")
def main():
parser = argparse.ArgumentParser(prog="mnemosyne", description="The Living Holographic Archive")
sub = parser.add_subparsers(dest="command")
@@ -119,6 +172,18 @@ def main():
ex.add_argument("-q", "--query", default="", help="Keyword filter")
ex.add_argument("-t", "--topics", default="", help="Comma-separated topic filter")
cl = sub.add_parser("clusters", help="Show graph clusters (connected components)")
cl.add_argument("-m", "--min-size", type=int, default=1, help="Minimum cluster size")
cl.add_argument("-v", "--verbose", action="store_true", help="List entries in each cluster")
hu = sub.add_parser("hubs", help="Show most connected entries (hub analysis)")
hu.add_argument("-n", "--limit", type=int, default=10, help="Max hubs to show")
sub.add_parser("bridges", help="Show bridge entries (articulation points)")
rb = sub.add_parser("rebuild", help="Recompute all links from scratch")
rb.add_argument("-t", "--threshold", type=float, default=None, help="Similarity threshold override")
args = parser.parse_args()
if not args.command:
parser.print_help()
@@ -132,6 +197,10 @@ def main():
"topics": cmd_topics,
"remove": cmd_remove,
"export": cmd_export,
"clusters": cmd_clusters,
"hubs": cmd_hubs,
"bridges": cmd_bridges,
"rebuild": cmd_rebuild,
}
dispatch[args.command](args)

View File

@@ -0,0 +1,271 @@
"""Tests for Mnemosyne graph cluster analysis features.
Tests: graph_clusters, hub_entries, bridge_entries, rebuild_links.
"""
import pytest
from pathlib import Path
import tempfile
from nexus.mnemosyne.archive import MnemosyneArchive
from nexus.mnemosyne.entry import ArchiveEntry
@pytest.fixture
def archive():
    """Yield a fresh MnemosyneArchive backed by a throwaway temp file."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        archive_file = Path(tmp_dir) / "test_archive.json"
        yield MnemosyneArchive(archive_path=archive_file)
def _make_entry(title="Test", content="test content", topics=None):
    """Build an ArchiveEntry, defaulting falsy topics to a fresh empty list."""
    return ArchiveEntry(title=title, content=content, topics=topics if topics else [])
class TestGraphClusters:
    """Tests for graph_clusters() connected-component discovery."""

    @staticmethod
    def _connect(a, b):
        """Symmetrically link two entries in place."""
        a.links.append(b.id)
        b.links.append(a.id)

    def test_empty_archive(self, archive):
        assert archive.graph_clusters() == []

    def test_single_orphan(self, archive):
        archive.add(_make_entry("Lone entry"), auto_link=False)
        # min_size=1 keeps isolated entries in the result
        result = archive.graph_clusters(min_size=1)
        assert len(result) == 1
        assert result[0]["size"] == 1
        assert result[0]["density"] == 0.0

    def test_single_orphan_filtered(self, archive):
        archive.add(_make_entry("Lone entry"), auto_link=False)
        assert archive.graph_clusters(min_size=2) == []

    def test_two_linked_entries(self, archive):
        """Two manually linked entries form a single cluster."""
        first = archive.add(_make_entry("Alpha dogs", "canine training"), auto_link=False)
        second = archive.add(_make_entry("Beta cats", "feline behavior"), auto_link=False)
        self._connect(first, second)
        archive._save()
        result = archive.graph_clusters(min_size=2)
        assert len(result) == 1
        assert result[0]["size"] == 2
        assert result[0]["internal_edges"] == 1
        assert result[0]["density"] == 1.0  # 1 edge out of 1 possible

    def test_two_separate_clusters(self, archive):
        """Disconnected groups each become their own cluster."""
        ai_one = archive.add(_make_entry("AI models", "neural networks"), auto_link=False)
        ai_two = archive.add(_make_entry("AI training", "gradient descent"), auto_link=False)
        cook_one = archive.add(_make_entry("Cooking pasta", "italian recipes"), auto_link=False)
        cook_two = archive.add(_make_entry("Cooking sauces", "tomato basil"), auto_link=False)
        self._connect(ai_one, ai_two)
        self._connect(cook_one, cook_two)
        archive._save()
        result = archive.graph_clusters(min_size=2)
        assert len(result) == 2
        assert sorted(c["size"] for c in result) == [2, 2]

    def test_cluster_topics(self, archive):
        """Cluster output aggregates member topics."""
        first = archive.add(_make_entry("Alpha", "content", topics=["ai", "models"]), auto_link=False)
        second = archive.add(_make_entry("Beta", "content", topics=["ai", "training"]), auto_link=False)
        self._connect(first, second)
        archive._save()
        result = archive.graph_clusters(min_size=2)
        assert "ai" in result[0]["top_topics"]

    def test_density_calculation(self, archive):
        """A fully connected triangle has density 1.0."""
        nodes = [
            archive.add(_make_entry(t, c), auto_link=False)
            for t, c in (("A", "aaa"), ("B", "bbb"), ("C", "ccc"))
        ]
        self._connect(nodes[0], nodes[1])
        self._connect(nodes[0], nodes[2])
        self._connect(nodes[1], nodes[2])
        archive._save()
        result = archive.graph_clusters(min_size=2)
        assert len(result) == 1
        assert result[0]["internal_edges"] == 3
        assert result[0]["density"] == 1.0  # 3 edges / 3 possible

    def test_chain_density(self, archive):
        """An A-B-C chain has density 2/3 (2 of 3 possible edges)."""
        a = archive.add(_make_entry("A", "aaa"), auto_link=False)
        b = archive.add(_make_entry("B", "bbb"), auto_link=False)
        c = archive.add(_make_entry("C", "ccc"), auto_link=False)
        self._connect(a, b)
        self._connect(b, c)
        archive._save()
        result = archive.graph_clusters(min_size=2)
        assert abs(result[0]["density"] - 2 / 3) < 0.01
class TestHubEntries:
    """Tests for hub_entries() degree-centrality ranking."""

    def test_empty(self, archive):
        assert archive.hub_entries() == []

    def test_no_links(self, archive):
        archive.add(_make_entry("Lone"), auto_link=False)
        assert archive.hub_entries() == []

    def test_hub_ordering(self, archive):
        """The entry with the most links is ranked first."""
        hub = archive.add(_make_entry("Hub", "central node"), auto_link=False)
        spokes = [
            archive.add(_make_entry(f"Spoke {i}", "content"), auto_link=False)
            for i in (1, 2, 3)
        ]
        # Hub connects to every spoke; each spoke links back.
        for spoke in spokes:
            hub.links.append(spoke.id)
            spoke.links.append(hub.id)
        archive._save()
        result = archive.hub_entries()
        assert len(result) == 4
        assert result[0]["entry"].id == hub.id
        assert result[0]["degree"] == 3

    def test_limit(self, archive):
        left = archive.add(_make_entry("A", ""), auto_link=False)
        right = archive.add(_make_entry("B", ""), auto_link=False)
        left.links.append(right.id)
        right.links.append(left.id)
        archive._save()
        assert len(archive.hub_entries(limit=1)) == 1

    def test_inbound_outbound(self, archive):
        """Inbound counts links TO an entry; outbound counts links FROM it."""
        source = archive.add(_make_entry("Source", ""), auto_link=False)
        target = archive.add(_make_entry("Target", ""), auto_link=False)
        source.links.append(target.id)  # one-directional link
        archive._save()
        by_id = {h["entry"].id: h for h in archive.hub_entries()}
        assert by_id[source.id]["inbound"] == 0
        assert by_id[source.id]["outbound"] == 1
        assert by_id[target.id]["inbound"] == 1
        assert by_id[target.id]["outbound"] == 0
class TestBridgeEntries:
    """Tests for bridge_entries() articulation-point detection."""

    def test_empty(self, archive):
        assert archive.bridge_entries() == []

    def test_no_bridges_in_triangle(self, archive):
        """A fully connected triangle has no articulation points."""
        corners = [archive.add(_make_entry(name, ""), auto_link=False) for name in "ABC"]
        for node in corners:
            for other in corners:
                if other is not node:
                    node.links.append(other.id)
        archive._save()
        assert archive.bridge_entries() == []

    def test_bridge_in_chain(self, archive):
        """In an A-B-C chain, B is the sole articulation point."""
        a = archive.add(_make_entry("A", ""), auto_link=False)
        b = archive.add(_make_entry("B", ""), auto_link=False)
        c = archive.add(_make_entry("C", ""), auto_link=False)
        a.links.append(b.id)
        b.links.extend([a.id, c.id])
        c.links.append(b.id)
        archive._save()
        result = archive.bridge_entries()
        assert len(result) == 1
        assert result[0]["entry"].id == b.id
        assert result[0]["components_after_removal"] == 2

    def test_no_bridges_in_small_cluster(self, archive):
        """Two-node clusters are below the bridge-detection minimum size."""
        left = archive.add(_make_entry("A", ""), auto_link=False)
        right = archive.add(_make_entry("B", ""), auto_link=False)
        left.links.append(right.id)
        right.links.append(left.id)
        archive._save()
        assert archive.bridge_entries() == []
class TestRebuildLinks:
    """Tests for rebuild_links() full recomputation."""

    def test_empty_archive(self, archive):
        assert archive.rebuild_links() == 0

    def test_creates_links(self, archive):
        """Rebuilding links connects similar entries."""
        for title, content in (
            ("Alpha dogs canine training", "obedience training"),
            ("Beta dogs canine behavior", "behavior training"),
            ("Cat food feline nutrition", "fish meals"),
        ):
            archive.add(_make_entry(title, content), auto_link=False)
        assert archive.rebuild_links() > 0
        # The dog entries should now reference each other.
        dog_entries = [
            e for e in archive._entries.values() if "dog" in e.title.lower()
        ]
        assert any(len(e.links) > 0 for e in dog_entries)

    def test_override_threshold(self, archive):
        """A lower similarity threshold produces at least as many links."""
        for title in ("Alpha dogs", "Beta cats", "Gamma birds"):
            archive.add(_make_entry(title, "training"), auto_link=False)
        permissive = archive.rebuild_links(threshold=0.01)
        # Reset before the strict run so the counts are comparable.
        for entry in archive._entries.values():
            entry.links = []
        strict = archive.rebuild_links(threshold=0.9)
        assert permissive >= strict

    def test_rebuild_persists(self, archive):
        """Links created by rebuild survive an archive reload."""
        archive.add(_make_entry("Alpha dogs", "training"), auto_link=False)
        archive.add(_make_entry("Beta dogs", "training"), auto_link=False)
        archive.rebuild_links()
        reloaded = MnemosyneArchive(archive_path=archive.path)
        assert sum(len(e.links) for e in reloaded._entries.values()) > 0