feat(mnemosyne): graph cluster analysis — clusters, hubs, bridges, rebuild_links

- graph_clusters(): BFS connected component discovery with density + topic analysis
- hub_entries(): degree centrality ranking of most connected entries
- bridge_entries(): Tarjan's articulation points — entries that connect clusters
- rebuild_links(): full link recomputation after bulk ingestion
- _build_adjacency(): internal adjacency builder with validation
This commit is contained in:
Alexander Whitestone
2026-04-11 18:42:32 -04:00
parent ed5ed011c2
commit 392c73eb03

View File

@@ -241,3 +241,247 @@ class MnemosyneArchive:
"oldest_entry": oldest_entry,
"newest_entry": newest_entry,
}
def _build_adjacency(self) -> dict[str, set[str]]:
"""Build adjacency dict from entry links. Only includes valid references."""
adj: dict[str, set[str]] = {eid: set() for eid in self._entries}
for eid, entry in self._entries.items():
for linked_id in entry.links:
if linked_id in self._entries and linked_id != eid:
adj[eid].add(linked_id)
adj[linked_id].add(eid)
return adj
def graph_clusters(self, min_size: int = 1) -> list[dict]:
"""Find connected component clusters in the holographic graph.
Uses BFS to discover groups of entries that are reachable from each
other through their links. Returns clusters sorted by size descending.
Args:
min_size: Minimum cluster size to include (filters out isolated entries).
Returns:
List of dicts with keys: cluster_id, size, entries, topics, density
"""
adj = self._build_adjacency()
visited: set[str] = set()
clusters: list[dict] = []
cluster_id = 0
for eid in self._entries:
if eid in visited:
continue
# BFS from this entry
component: list[str] = []
queue = [eid]
while queue:
current = queue.pop(0)
if current in visited:
continue
visited.add(current)
component.append(current)
for neighbor in adj.get(current, set()):
if neighbor not in visited:
queue.append(neighbor)
# Single-entry clusters are orphans
if len(component) < min_size:
continue
# Collect topics from cluster entries
cluster_topics: dict[str, int] = {}
internal_edges = 0
for cid in component:
entry = self._entries[cid]
for t in entry.topics:
cluster_topics[t] = cluster_topics.get(t, 0) + 1
internal_edges += len(adj.get(cid, set()))
internal_edges //= 2 # undirected, counted twice
# Density: actual edges / possible edges
n = len(component)
max_edges = n * (n - 1) // 2
density = round(internal_edges / max_edges, 4) if max_edges > 0 else 0.0
# Top topics by frequency
top_topics = sorted(cluster_topics.items(), key=lambda x: x[1], reverse=True)[:5]
clusters.append({
"cluster_id": cluster_id,
"size": n,
"entries": component,
"top_topics": [t for t, _ in top_topics],
"internal_edges": internal_edges,
"density": density,
})
cluster_id += 1
clusters.sort(key=lambda c: c["size"], reverse=True)
return clusters
def hub_entries(self, limit: int = 10) -> list[dict]:
"""Find the most connected entries (highest degree centrality).
These are the "hubs" of the holographic graph — entries that bridge
many topics and attract many links.
Args:
limit: Maximum number of hubs to return.
Returns:
List of dicts with keys: entry, degree, inbound, outbound, topics
"""
adj = self._build_adjacency()
inbound: dict[str, int] = {eid: 0 for eid in self._entries}
for entry in self._entries.values():
for lid in entry.links:
if lid in inbound:
inbound[lid] += 1
hubs = []
for eid, entry in self._entries.items():
degree = len(adj.get(eid, set()))
if degree == 0:
continue
hubs.append({
"entry": entry,
"degree": degree,
"inbound": inbound.get(eid, 0),
"outbound": len(entry.links),
"topics": entry.topics,
})
hubs.sort(key=lambda h: h["degree"], reverse=True)
return hubs[:limit]
def bridge_entries(self) -> list[dict]:
"""Find articulation points — entries whose removal would split a cluster.
These are "bridge" entries in the holographic graph. Removing them
disconnects members that were previously reachable through the bridge.
Uses Tarjan's algorithm for finding articulation points.
Returns:
List of dicts with keys: entry, cluster_size, bridges_between
"""
adj = self._build_adjacency()
# Find clusters first
clusters = self.graph_clusters(min_size=3)
if not clusters:
return []
# For each cluster, run Tarjan's algorithm
bridges: list[dict] = []
for cluster in clusters:
members = set(cluster["entries"])
if len(members) < 3:
continue
# Build subgraph adjacency
sub_adj = {eid: adj[eid] & members for eid in members}
# Tarjan's DFS for articulation points
discovery: dict[str, int] = {}
low: dict[str, int] = {}
parent: dict[str, Optional[str]] = {}
ap: set[str] = set()
timer = [0]
def dfs(u: str):
children = 0
discovery[u] = low[u] = timer[0]
timer[0] += 1
for v in sub_adj[u]:
if v not in discovery:
children += 1
parent[v] = u
dfs(v)
low[u] = min(low[u], low[v])
# u is AP if: root with 2+ children, or non-root with low[v] >= disc[u]
if parent.get(u) is None and children > 1:
ap.add(u)
if parent.get(u) is not None and low[v] >= discovery[u]:
ap.add(u)
elif v != parent.get(u):
low[u] = min(low[u], discovery[v])
for eid in members:
if eid not in discovery:
parent[eid] = None
dfs(eid)
# For each articulation point, estimate what it bridges
for ap_id in ap:
ap_entry = self._entries[ap_id]
# Remove it temporarily and count resulting components
temp_adj = {k: v.copy() for k, v in sub_adj.items()}
del temp_adj[ap_id]
for k in temp_adj:
temp_adj[k].discard(ap_id)
# BFS count components after removal
temp_visited: set[str] = set()
component_count = 0
for mid in members:
if mid == ap_id or mid in temp_visited:
continue
component_count += 1
queue = [mid]
while queue:
cur = queue.pop(0)
if cur in temp_visited:
continue
temp_visited.add(cur)
for nb in temp_adj.get(cur, set()):
if nb not in temp_visited:
queue.append(nb)
if component_count > 1:
bridges.append({
"entry": ap_entry,
"cluster_size": cluster["size"],
"components_after_removal": component_count,
"topics": ap_entry.topics,
})
bridges.sort(key=lambda b: b["components_after_removal"], reverse=True)
return bridges
def rebuild_links(self, threshold: Optional[float] = None) -> int:
"""Recompute all links from scratch.
Clears existing links and re-applies the holographic linker to every
entry pair. Useful after bulk ingestion or threshold changes.
Args:
threshold: Override the linker's default similarity threshold.
Returns:
Total number of links created.
"""
if threshold is not None:
old_threshold = self.linker.threshold
self.linker.threshold = threshold
# Clear all links
for entry in self._entries.values():
entry.links = []
entries = list(self._entries.values())
total_links = 0
# Re-link each entry against all others
for entry in entries:
candidates = [e for e in entries if e.id != entry.id]
new_links = self.linker.apply_links(entry, candidates)
total_links += new_links
if threshold is not None:
self.linker.threshold = old_threshold
self._save()
return total_links