Compare commits

...

3 Commits

Author SHA1 Message Date
Alexander Whitestone
3848b6f4ea test(mnemosyne): graph cluster analysis tests — 22 tests
Some checks failed
CI / test (pull_request) Failing after 10s
CI / validate (pull_request) Failing after 14s
Review Approval Gate / verify-review (pull_request) Failing after 3s
- graph_clusters: empty, orphans, linked pairs, separate clusters, topics, density
- hub_entries: ordering, limit, inbound/outbound counting
- bridge_entries: triangle (none), chain (B is bridge), small cluster filtered
- rebuild_links: creates links, threshold override, persistence
2026-04-11 18:44:58 -04:00
Alexander Whitestone
3ed129ad2b feat(mnemosyne): CLI commands for graph analysis
- mnemosyne clusters: show connected component clusters with density + topics
- mnemosyne hubs: most connected entries by degree centrality
- mnemosyne bridges: articulation points (entries connecting clusters)
- mnemosyne rebuild: recompute all links from scratch
2026-04-11 18:43:14 -04:00
Alexander Whitestone
392c73eb03 feat(mnemosyne): graph cluster analysis — clusters, hubs, bridges, rebuild_links
- graph_clusters(): BFS connected component discovery with density + topic analysis
- hub_entries(): degree centrality ranking of most connected entries
- bridge_entries(): Tarjan's articulation points — entries that connect clusters
- rebuild_links(): full link recomputation after bulk ingestion
- _build_adjacency(): internal adjacency builder with validation
2026-04-11 18:42:32 -04:00
3 changed files with 585 additions and 1 deletions

View File

@@ -241,3 +241,247 @@ class MnemosyneArchive:
"oldest_entry": oldest_entry,
"newest_entry": newest_entry,
}
def _build_adjacency(self) -> dict[str, set[str]]:
"""Build adjacency dict from entry links. Only includes valid references."""
adj: dict[str, set[str]] = {eid: set() for eid in self._entries}
for eid, entry in self._entries.items():
for linked_id in entry.links:
if linked_id in self._entries and linked_id != eid:
adj[eid].add(linked_id)
adj[linked_id].add(eid)
return adj
def graph_clusters(self, min_size: int = 1) -> list[dict]:
    """Find connected component clusters in the holographic graph.

    Uses BFS to discover groups of entries that are reachable from each
    other through their links. Returns clusters sorted by size descending.

    Args:
        min_size: Minimum cluster size to include (filters out isolated entries).

    Returns:
        List of dicts with keys: cluster_id, size, entries, top_topics,
        internal_edges, density.
    """
    # Local import keeps the method self-contained; deque gives O(1)
    # popleft (list.pop(0) is O(n) per dequeue, quadratic per component).
    from collections import deque

    adj = self._build_adjacency()
    visited: set[str] = set()
    clusters: list[dict] = []
    cluster_id = 0
    for eid in self._entries:
        if eid in visited:
            continue
        # BFS from this entry to collect its connected component.
        component: list[str] = []
        queue = deque([eid])
        while queue:
            current = queue.popleft()
            if current in visited:
                continue
            visited.add(current)
            component.append(current)
            for neighbor in adj.get(current, set()):
                if neighbor not in visited:
                    queue.append(neighbor)
        # Drop clusters below the requested size (orphans, by default, stay).
        if len(component) < min_size:
            continue
        # Aggregate topic frequencies and count internal edges.
        cluster_topics: dict[str, int] = {}
        internal_edges = 0
        for cid in component:
            entry = self._entries[cid]
            for t in entry.topics:
                cluster_topics[t] = cluster_topics.get(t, 0) + 1
            internal_edges += len(adj.get(cid, set()))
        internal_edges //= 2  # undirected: each edge counted from both ends
        # Density: actual edges / possible edges in a clique of this size.
        n = len(component)
        max_edges = n * (n - 1) // 2
        density = round(internal_edges / max_edges, 4) if max_edges > 0 else 0.0
        # Top 5 topics by frequency.
        top_topics = sorted(cluster_topics.items(), key=lambda x: x[1], reverse=True)[:5]
        clusters.append({
            "cluster_id": cluster_id,
            "size": n,
            "entries": component,
            "top_topics": [t for t, _ in top_topics],
            "internal_edges": internal_edges,
            "density": density,
        })
        cluster_id += 1
    clusters.sort(key=lambda c: c["size"], reverse=True)
    return clusters
def hub_entries(self, limit: int = 10) -> list[dict]:
    """Rank entries by degree centrality and return the top hubs.

    Hubs are the most connected entries in the holographic graph — the
    ones that bridge many topics and attract many links.

    Args:
        limit: Maximum number of hubs to return.

    Returns:
        List of dicts with keys: entry, degree, inbound, outbound, topics
    """
    adjacency = self._build_adjacency()
    # Count inbound references: how many other entries link TO each id.
    incoming = {entry_id: 0 for entry_id in self._entries}
    for source in self._entries.values():
        for target_id in source.links:
            if target_id in incoming:
                incoming[target_id] += 1
    ranked = []
    for entry_id, entry in self._entries.items():
        degree = len(adjacency.get(entry_id, set()))
        if not degree:
            continue  # unconnected entries can never be hubs
        ranked.append({
            "entry": entry,
            "degree": degree,
            "inbound": incoming.get(entry_id, 0),
            "outbound": len(entry.links),
            "topics": entry.topics,
        })
    ranked.sort(key=lambda record: record["degree"], reverse=True)
    return ranked[:limit]
def bridge_entries(self) -> list[dict]:
    """Find articulation points — entries whose removal would split a cluster.

    These are "bridge" entries in the holographic graph. Removing them
    disconnects members that were previously reachable through the bridge.
    Uses Tarjan's algorithm for finding articulation points.

    Returns:
        List of dicts with keys: entry, cluster_size,
        components_after_removal, topics — sorted by
        components_after_removal descending.
    """
    adj = self._build_adjacency()
    # Find clusters first — articulation points are only meaningful in
    # components of 3+ entries (min_size=3 filters the rest up front).
    clusters = self.graph_clusters(min_size=3)
    if not clusters:
        return []
    # For each cluster, run Tarjan's algorithm
    bridges: list[dict] = []
    for cluster in clusters:
        members = set(cluster["entries"])
        if len(members) < 3:
            continue
        # Build subgraph adjacency restricted to this cluster's members.
        sub_adj = {eid: adj[eid] & members for eid in members}
        # Tarjan's DFS state: discovery times, low-link values, DFS-tree
        # parents, and the resulting articulation-point set. Recreated per
        # cluster so clusters never share state.
        discovery: dict[str, int] = {}
        low: dict[str, int] = {}
        parent: dict[str, Optional[str]] = {}
        ap: set[str] = set()
        timer = [0]  # one-element list: mutable counter shared with the closure
        def dfs(u: str):
            # Recursive DFS; NOTE(review): deep clusters could approach
            # Python's recursion limit — unverified for large archives.
            children = 0
            discovery[u] = low[u] = timer[0]
            timer[0] += 1
            for v in sub_adj[u]:
                if v not in discovery:
                    # Tree edge: recurse, then propagate the child's low-link.
                    children += 1
                    parent[v] = u
                    dfs(v)
                    low[u] = min(low[u], low[v])
                    # u is AP if: root with 2+ children, or non-root with low[v] >= disc[u]
                    if parent.get(u) is None and children > 1:
                        ap.add(u)
                    if parent.get(u) is not None and low[v] >= discovery[u]:
                        ap.add(u)
                elif v != parent.get(u):
                    # Back edge: take the neighbor's discovery time into low[u].
                    low[u] = min(low[u], discovery[v])
        for eid in members:
            if eid not in discovery:
                parent[eid] = None  # roots of the DFS forest have no parent
                dfs(eid)
        # For each articulation point, estimate what it bridges
        for ap_id in ap:
            ap_entry = self._entries[ap_id]
            # Remove it temporarily and count resulting components
            temp_adj = {k: v.copy() for k, v in sub_adj.items()}
            del temp_adj[ap_id]
            for k in temp_adj:
                temp_adj[k].discard(ap_id)
            # BFS count components after removal
            temp_visited: set[str] = set()
            component_count = 0
            for mid in members:
                if mid == ap_id or mid in temp_visited:
                    continue
                # Each unvisited member starts a new component.
                component_count += 1
                queue = [mid]
                while queue:
                    cur = queue.pop(0)
                    if cur in temp_visited:
                        continue
                    temp_visited.add(cur)
                    for nb in temp_adj.get(cur, set()):
                        if nb not in temp_visited:
                            queue.append(nb)
            # Only report genuine splits (removal left >1 component).
            if component_count > 1:
                bridges.append({
                    "entry": ap_entry,
                    "cluster_size": cluster["size"],
                    "components_after_removal": component_count,
                    "topics": ap_entry.topics,
                })
    bridges.sort(key=lambda b: b["components_after_removal"], reverse=True)
    return bridges
def rebuild_links(self, threshold: Optional[float] = None) -> int:
    """Recompute all links from scratch.

    Clears existing links and re-applies the holographic linker to every
    entry pair. Useful after bulk ingestion or threshold changes.

    Args:
        threshold: Override the linker's default similarity threshold.
            The linker's original threshold is always restored afterwards,
            even if linking raises.

    Returns:
        Total number of links created.
    """
    old_threshold = None
    if threshold is not None:
        old_threshold = self.linker.threshold
        self.linker.threshold = threshold
    try:
        # Clear all links before recomputing.
        for entry in self._entries.values():
            entry.links = []
        entries = list(self._entries.values())
        total_links = 0
        # Re-link each entry against all others.
        for entry in entries:
            candidates = [e for e in entries if e.id != entry.id]
            total_links += self.linker.apply_links(entry, candidates)
    finally:
        # Restore caller-visible linker state even when apply_links raises
        # (the original code leaked the override on exception).
        if threshold is not None:
            self.linker.threshold = old_threshold
    self._save()
    return total_links

View File

@@ -1,7 +1,8 @@
"""CLI interface for Mnemosyne.
Provides: mnemosyne ingest, mnemosyne search, mnemosyne link, mnemosyne stats,
mnemosyne topics, mnemosyne remove, mnemosyne export
mnemosyne topics, mnemosyne remove, mnemosyne export,
mnemosyne clusters, mnemosyne hubs, mnemosyne bridges, mnemosyne rebuild
"""
from __future__ import annotations
@@ -90,6 +91,58 @@ def cmd_export(args):
print(json.dumps(data, indent=2))
def cmd_clusters(args):
    """Print connected-component clusters, optionally listing member entries."""
    archive = MnemosyneArchive()
    clusters = archive.graph_clusters(min_size=args.min_size)
    if not clusters:
        print("No clusters found.")
        return
    for cluster in clusters:
        print(f"Cluster {cluster['cluster_id']}: {cluster['size']} entries, density={cluster['density']}")
        topic_names = cluster["top_topics"]
        print(f" Topics: {', '.join(topic_names) if topic_names else '(none)'}")
        if args.verbose:
            for entry_id in cluster["entries"]:
                member = archive.get(entry_id)
                if member:
                    print(f" [{entry_id[:8]}] {member.title}")
        print()
def cmd_hubs(args):
    """Print the most connected entries, ranked by degree."""
    archive = MnemosyneArchive()
    hubs = archive.hub_entries(limit=args.limit)
    if not hubs:
        print("No hubs found.")
        return
    for hub in hubs:
        entry = hub["entry"]
        print(f"[{entry.id[:8]}] {entry.title}")
        print(f" Degree: {hub['degree']} (in: {hub['inbound']}, out: {hub['outbound']})")
        print(f" Topics: {', '.join(hub['topics']) if hub['topics'] else '(none)'}")
        print()
def cmd_bridges(args):
    """Print articulation-point entries that hold clusters together."""
    archive = MnemosyneArchive()
    bridges = archive.bridge_entries()
    if not bridges:
        print("No bridge entries found.")
        return
    for bridge in bridges:
        entry = bridge["entry"]
        print(f"[{entry.id[:8]}] {entry.title}")
        print(f" Bridges {bridge['components_after_removal']} components (cluster: {bridge['cluster_size']} entries)")
        print(f" Topics: {', '.join(bridge['topics']) if bridge['topics'] else '(none)'}")
        print()
def cmd_rebuild(args):
    """Recompute every link, optionally with a similarity-threshold override."""
    archive = MnemosyneArchive()
    # Pass the threshold through unchanged: argparse already defaults it to
    # None, and the previous truthiness check (`if args.threshold`) silently
    # discarded an explicit `-t 0.0` override.
    total = archive.rebuild_links(threshold=args.threshold)
    print(f"Rebuilt links: {total} connections across {archive.count} entries")
def main():
parser = argparse.ArgumentParser(prog="mnemosyne", description="The Living Holographic Archive")
sub = parser.add_subparsers(dest="command")
@@ -119,6 +172,18 @@ def main():
ex.add_argument("-q", "--query", default="", help="Keyword filter")
ex.add_argument("-t", "--topics", default="", help="Comma-separated topic filter")
cl = sub.add_parser("clusters", help="Show graph clusters (connected components)")
cl.add_argument("-m", "--min-size", type=int, default=1, help="Minimum cluster size")
cl.add_argument("-v", "--verbose", action="store_true", help="List entries in each cluster")
hu = sub.add_parser("hubs", help="Show most connected entries (hub analysis)")
hu.add_argument("-n", "--limit", type=int, default=10, help="Max hubs to show")
sub.add_parser("bridges", help="Show bridge entries (articulation points)")
rb = sub.add_parser("rebuild", help="Recompute all links from scratch")
rb.add_argument("-t", "--threshold", type=float, default=None, help="Similarity threshold override")
args = parser.parse_args()
if not args.command:
parser.print_help()
@@ -132,6 +197,10 @@ def main():
"topics": cmd_topics,
"remove": cmd_remove,
"export": cmd_export,
"clusters": cmd_clusters,
"hubs": cmd_hubs,
"bridges": cmd_bridges,
"rebuild": cmd_rebuild,
}
dispatch[args.command](args)

View File

@@ -0,0 +1,271 @@
"""Tests for Mnemosyne graph cluster analysis features.
Tests: graph_clusters, hub_entries, bridge_entries, rebuild_links.
"""
import pytest
from pathlib import Path
import tempfile
from nexus.mnemosyne.archive import MnemosyneArchive
from nexus.mnemosyne.entry import ArchiveEntry
@pytest.fixture
def archive():
    """Yield a fresh MnemosyneArchive backed by a throwaway temp file."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        archive_file = Path(tmp_dir) / "test_archive.json"
        yield MnemosyneArchive(archive_path=archive_file)
def _make_entry(title="Test", content="test content", topics=None):
    """Build an ArchiveEntry, defaulting falsy topics to a fresh empty list."""
    return ArchiveEntry(title=title, content=content, topics=topics if topics else [])
class TestGraphClusters:
    """Tests for graph_clusters() connected-component discovery."""

    @staticmethod
    def _connect(a, b):
        """Symmetrically link two entries in place."""
        a.links.append(b.id)
        b.links.append(a.id)

    def test_empty_archive(self, archive):
        assert archive.graph_clusters() == []

    def test_single_orphan(self, archive):
        archive.add(_make_entry("Lone entry"), auto_link=False)
        # min_size=1 keeps isolated entries in the result
        result = archive.graph_clusters(min_size=1)
        assert len(result) == 1
        assert result[0]["size"] == 1
        assert result[0]["density"] == 0.0

    def test_single_orphan_filtered(self, archive):
        archive.add(_make_entry("Lone entry"), auto_link=False)
        assert archive.graph_clusters(min_size=2) == []

    def test_two_linked_entries(self, archive):
        """Two manually linked entries form a single cluster."""
        first = archive.add(_make_entry("Alpha dogs", "canine training"), auto_link=False)
        second = archive.add(_make_entry("Beta cats", "feline behavior"), auto_link=False)
        self._connect(first, second)
        archive._save()
        result = archive.graph_clusters(min_size=2)
        assert len(result) == 1
        assert result[0]["size"] == 2
        assert result[0]["internal_edges"] == 1
        assert result[0]["density"] == 1.0  # 1 edge out of 1 possible

    def test_two_separate_clusters(self, archive):
        """Disconnected groups each become their own cluster."""
        ai_one = archive.add(_make_entry("AI models", "neural networks"), auto_link=False)
        ai_two = archive.add(_make_entry("AI training", "gradient descent"), auto_link=False)
        cook_one = archive.add(_make_entry("Cooking pasta", "italian recipes"), auto_link=False)
        cook_two = archive.add(_make_entry("Cooking sauces", "tomato basil"), auto_link=False)
        self._connect(ai_one, ai_two)
        self._connect(cook_one, cook_two)
        archive._save()
        result = archive.graph_clusters(min_size=2)
        assert len(result) == 2
        assert sorted(c["size"] for c in result) == [2, 2]

    def test_cluster_topics(self, archive):
        """Cluster output aggregates member topics."""
        first = archive.add(_make_entry("Alpha", "content", topics=["ai", "models"]), auto_link=False)
        second = archive.add(_make_entry("Beta", "content", topics=["ai", "training"]), auto_link=False)
        self._connect(first, second)
        archive._save()
        result = archive.graph_clusters(min_size=2)
        assert "ai" in result[0]["top_topics"]

    def test_density_calculation(self, archive):
        """A fully connected triangle has density 1.0."""
        nodes = [
            archive.add(_make_entry(t, c), auto_link=False)
            for t, c in (("A", "aaa"), ("B", "bbb"), ("C", "ccc"))
        ]
        self._connect(nodes[0], nodes[1])
        self._connect(nodes[0], nodes[2])
        self._connect(nodes[1], nodes[2])
        archive._save()
        result = archive.graph_clusters(min_size=2)
        assert len(result) == 1
        assert result[0]["internal_edges"] == 3
        assert result[0]["density"] == 1.0  # 3 edges / 3 possible

    def test_chain_density(self, archive):
        """An A-B-C chain has density 2/3 (2 of 3 possible edges)."""
        a = archive.add(_make_entry("A", "aaa"), auto_link=False)
        b = archive.add(_make_entry("B", "bbb"), auto_link=False)
        c = archive.add(_make_entry("C", "ccc"), auto_link=False)
        self._connect(a, b)
        self._connect(b, c)
        archive._save()
        result = archive.graph_clusters(min_size=2)
        assert abs(result[0]["density"] - 2 / 3) < 0.01
class TestHubEntries:
    """Tests for hub_entries() degree-centrality ranking."""

    def test_empty(self, archive):
        assert archive.hub_entries() == []

    def test_no_links(self, archive):
        archive.add(_make_entry("Lone"), auto_link=False)
        assert archive.hub_entries() == []

    def test_hub_ordering(self, archive):
        """The entry with the most links is ranked first."""
        hub = archive.add(_make_entry("Hub", "central node"), auto_link=False)
        spokes = [
            archive.add(_make_entry(f"Spoke {i}", "content"), auto_link=False)
            for i in (1, 2, 3)
        ]
        # Hub connects to every spoke; each spoke links back.
        for spoke in spokes:
            hub.links.append(spoke.id)
            spoke.links.append(hub.id)
        archive._save()
        result = archive.hub_entries()
        assert len(result) == 4
        assert result[0]["entry"].id == hub.id
        assert result[0]["degree"] == 3

    def test_limit(self, archive):
        left = archive.add(_make_entry("A", ""), auto_link=False)
        right = archive.add(_make_entry("B", ""), auto_link=False)
        left.links.append(right.id)
        right.links.append(left.id)
        archive._save()
        assert len(archive.hub_entries(limit=1)) == 1

    def test_inbound_outbound(self, archive):
        """Inbound counts links TO an entry; outbound counts links FROM it."""
        source = archive.add(_make_entry("Source", ""), auto_link=False)
        target = archive.add(_make_entry("Target", ""), auto_link=False)
        source.links.append(target.id)  # one-directional link
        archive._save()
        by_id = {h["entry"].id: h for h in archive.hub_entries()}
        assert by_id[source.id]["inbound"] == 0
        assert by_id[source.id]["outbound"] == 1
        assert by_id[target.id]["inbound"] == 1
        assert by_id[target.id]["outbound"] == 0
class TestBridgeEntries:
    """Tests for bridge_entries() articulation-point detection."""

    def test_empty(self, archive):
        assert archive.bridge_entries() == []

    def test_no_bridges_in_triangle(self, archive):
        """A fully connected triangle has no articulation points."""
        corners = [archive.add(_make_entry(name, ""), auto_link=False) for name in "ABC"]
        for node in corners:
            for other in corners:
                if other is not node:
                    node.links.append(other.id)
        archive._save()
        assert archive.bridge_entries() == []

    def test_bridge_in_chain(self, archive):
        """In an A-B-C chain, B is the sole articulation point."""
        a = archive.add(_make_entry("A", ""), auto_link=False)
        b = archive.add(_make_entry("B", ""), auto_link=False)
        c = archive.add(_make_entry("C", ""), auto_link=False)
        a.links.append(b.id)
        b.links.extend([a.id, c.id])
        c.links.append(b.id)
        archive._save()
        result = archive.bridge_entries()
        assert len(result) == 1
        assert result[0]["entry"].id == b.id
        assert result[0]["components_after_removal"] == 2

    def test_no_bridges_in_small_cluster(self, archive):
        """Two-node clusters are below the bridge-detection minimum size."""
        left = archive.add(_make_entry("A", ""), auto_link=False)
        right = archive.add(_make_entry("B", ""), auto_link=False)
        left.links.append(right.id)
        right.links.append(left.id)
        archive._save()
        assert archive.bridge_entries() == []
class TestRebuildLinks:
    """Tests for rebuild_links() full recomputation."""

    def test_empty_archive(self, archive):
        assert archive.rebuild_links() == 0

    def test_creates_links(self, archive):
        """Rebuilding links connects similar entries."""
        for title, content in (
            ("Alpha dogs canine training", "obedience training"),
            ("Beta dogs canine behavior", "behavior training"),
            ("Cat food feline nutrition", "fish meals"),
        ):
            archive.add(_make_entry(title, content), auto_link=False)
        assert archive.rebuild_links() > 0
        # The dog entries should now reference each other.
        dog_entries = [
            e for e in archive._entries.values() if "dog" in e.title.lower()
        ]
        assert any(len(e.links) > 0 for e in dog_entries)

    def test_override_threshold(self, archive):
        """A lower similarity threshold produces at least as many links."""
        for title in ("Alpha dogs", "Beta cats", "Gamma birds"):
            archive.add(_make_entry(title, "training"), auto_link=False)
        permissive = archive.rebuild_links(threshold=0.01)
        # Reset before the strict run so the counts are comparable.
        for entry in archive._entries.values():
            entry.links = []
        strict = archive.rebuild_links(threshold=0.9)
        assert permissive >= strict

    def test_rebuild_persists(self, archive):
        """Links created by rebuild survive an archive reload."""
        archive.add(_make_entry("Alpha dogs", "training"), auto_link=False)
        archive.add(_make_entry("Beta dogs", "training"), auto_link=False)
        archive.rebuild_links()
        reloaded = MnemosyneArchive(archive_path=archive.path)
        assert sum(len(e.links) for e in reloaded._entries.values()) > 0