feat(mnemosyne): graph cluster analysis — clusters, hubs, bridges, rebuild_links
- graph_clusters(): BFS connected-component discovery with density and topic analysis
- hub_entries(): degree-centrality ranking of the most connected entries
- bridge_entries(): articulation points (Tarjan's algorithm) — entries that connect clusters
- rebuild_links(): full link recomputation after bulk ingestion
- _build_adjacency(): internal adjacency builder with validation
This commit is contained in:
@@ -241,3 +241,247 @@ class MnemosyneArchive:
|
||||
"oldest_entry": oldest_entry,
|
||||
"newest_entry": newest_entry,
|
||||
}
|
||||
|
||||
def _build_adjacency(self) -> dict[str, set[str]]:
    """Return an undirected neighbour map derived from every entry's links.

    Every id in ``self._entries`` gets a key, even when it ends up with no
    neighbours. A link is kept only when its target is a known entry id and
    is not the linking entry itself; each kept link is recorded in both
    directions.
    """
    known = self._entries
    neighbours: dict[str, set[str]] = {}
    for entry_id in known:
        neighbours[entry_id] = set()

    for source_id, source in known.items():
        for target_id in source.links:
            # Skip self-links and references to ids that are not in the archive.
            if target_id == source_id or target_id not in known:
                continue
            neighbours[source_id].add(target_id)
            neighbours[target_id].add(source_id)

    return neighbours
|
||||
|
||||
def graph_clusters(self, min_size: int = 1) -> list[dict]:
    """Find connected-component clusters in the holographic graph.

    Uses breadth-first search over the undirected adjacency returned by
    ``self._build_adjacency()`` to group entries that are reachable from
    each other through their links.

    Args:
        min_size: Minimum number of entries a component must have to be
            reported. The default of 1 includes isolated entries; pass 2
            or more to drop them.

    Returns:
        List of dicts sorted by ``size`` descending, each with keys:
            cluster_id: 0-based index assigned in discovery order among the
                reported clusters (assigned before sorting by size).
            size: Number of entries in the cluster.
            entries: Entry ids in BFS visit order.
            top_topics: Up to five topics, most frequent first.
            internal_edges: Number of undirected links inside the cluster.
            density: ``internal_edges`` divided by the number of possible
                undirected pairs, rounded to 4 places (0.0 for one entry).
    """
    from collections import Counter, deque

    adj = self._build_adjacency()
    visited: set[str] = set()
    clusters: list[dict] = []

    for eid in self._entries:
        if eid in visited:
            continue

        # BFS from this entry. Nodes are marked when enqueued, so each one
        # enters the queue once; the visit order matches a plain FIFO walk.
        visited.add(eid)
        component: list[str] = [eid]
        queue = deque([eid])
        while queue:
            current = queue.popleft()
            for neighbor in adj.get(current, ()):
                if neighbor not in visited:
                    visited.add(neighbor)
                    component.append(neighbor)
                    queue.append(neighbor)

        n = len(component)
        if n < min_size:
            continue

        # Topic frequencies and summed degrees across the cluster.
        topic_counts: Counter = Counter()
        degree_sum = 0
        for cid in component:
            topic_counts.update(self._entries[cid].topics)
            degree_sum += len(adj.get(cid, ()))
        internal_edges = degree_sum // 2  # undirected: each edge counted twice

        # Density: actual edges / possible edges.
        max_edges = n * (n - 1) // 2
        density = round(internal_edges / max_edges, 4) if max_edges > 0 else 0.0

        clusters.append({
            "cluster_id": len(clusters),
            "size": n,
            "entries": component,
            "top_topics": [topic for topic, _ in topic_counts.most_common(5)],
            "internal_edges": internal_edges,
            "density": density,
        })

    clusters.sort(key=lambda c: c["size"], reverse=True)
    return clusters
|
||||
|
||||
def hub_entries(self, limit: int = 10) -> list[dict]:
    """Find the most connected entries (highest degree centrality).

    Degree is the number of distinct neighbours in the undirected adjacency
    from ``self._build_adjacency()``. Inbound and outbound counts use the
    same validity rules as that adjacency: only distinct links to known
    entries other than the entry itself are counted, so dangling, duplicate
    and self links never inflate the numbers.

    Args:
        limit: Maximum number of hubs to return. Zero or a negative value
            yields an empty list.

    Returns:
        List of dicts sorted by ``degree`` descending, each with keys:
        entry, degree, inbound, outbound, topics. Entries with no
        neighbours are omitted.
    """
    adj = self._build_adjacency()
    entries = self._entries

    inbound: dict[str, int] = dict.fromkeys(entries, 0)
    outbound: dict[str, int] = {}
    for eid, entry in entries.items():
        # Distinct, valid, non-self targets of this entry's own links.
        targets = {lid for lid in entry.links if lid in entries and lid != eid}
        outbound[eid] = len(targets)
        for lid in targets:
            inbound[lid] += 1

    hubs = []
    for eid, entry in entries.items():
        degree = len(adj.get(eid, ()))
        if degree == 0:
            continue
        hubs.append({
            "entry": entry,
            "degree": degree,
            "inbound": inbound[eid],
            "outbound": outbound[eid],
            "topics": entry.topics,
        })

    hubs.sort(key=lambda h: h["degree"], reverse=True)
    # Clamp so a negative limit cannot slice from the tail.
    return hubs[:max(limit, 0)]
|
||||
|
||||
def bridge_entries(self) -> list[dict]:
    """Find articulation points — entries whose removal would split a cluster.

    Only clusters of at least three entries (as reported by
    ``self.graph_clusters(min_size=3)``) are examined. Articulation points
    are found with Tarjan's low-link algorithm, run as an iterative DFS so
    that long chain-shaped clusters cannot exhaust Python's recursion limit.

    The number of pieces a cluster falls into when an entry is removed is
    read directly from the DFS tree: one piece per child subtree that has
    no back edge above the entry, plus one for the rest of the cluster when
    the entry is not the DFS root.

    Returns:
        List of dicts sorted by ``components_after_removal`` descending,
        each with keys: entry, cluster_size, components_after_removal,
        topics. Ties keep cluster order, then member order within a cluster.
    """
    adj = self._build_adjacency()

    bridges: list[dict] = []
    for cluster in self.graph_clusters(min_size=3):
        ordered = cluster["entries"]
        members = set(ordered)
        if len(members) < 3:
            continue

        # Restrict the adjacency to this cluster's members.
        sub_adj = {eid: adj[eid] & members for eid in members}

        discovery: dict[str, int] = {}
        low: dict[str, int] = {}
        parent: dict = {}
        # split_children[u]: DFS children of u whose subtree cannot reach
        # anything discovered before u (low[child] >= discovery[u]).
        split_children: dict[str, int] = {}
        timer = 0

        for root in ordered:
            if root in discovery:
                continue
            parent[root] = None
            discovery[root] = low[root] = timer
            timer += 1
            # Each frame holds a node and an iterator over its remaining
            # neighbours, standing in for the recursive call stack.
            stack = [(root, iter(sub_adj[root]))]
            while stack:
                u, pending = stack[-1]
                descended = False
                for v in pending:
                    if v not in discovery:
                        parent[v] = u
                        discovery[v] = low[v] = timer
                        timer += 1
                        stack.append((v, iter(sub_adj[v])))
                        descended = True
                        break
                    if v != parent[u]:
                        # Back edge to an already discovered node.
                        low[u] = min(low[u], discovery[v])
                if descended:
                    continue

                # u is fully explored: propagate its low-link to the parent.
                stack.pop()
                up = parent[u]
                if up is not None:
                    low[up] = min(low[up], low[u])
                    if low[u] >= discovery[up]:
                        split_children[up] = split_children.get(up, 0) + 1

        for eid in ordered:
            component_count = split_children.get(eid, 0)
            if parent[eid] is not None:
                # Everything reached before eid stays together as one piece.
                component_count += 1
            if component_count > 1:
                ap_entry = self._entries[eid]
                bridges.append({
                    "entry": ap_entry,
                    "cluster_size": cluster["size"],
                    "components_after_removal": component_count,
                    "topics": ap_entry.topics,
                })

    bridges.sort(key=lambda b: b["components_after_removal"], reverse=True)
    return bridges
|
||||
|
||||
def rebuild_links(self, threshold: Optional[float] = None) -> int:
    """Recompute all links from scratch.

    Clears every entry's ``links`` list, then calls
    ``self.linker.apply_links`` once per entry with all other entries as
    candidates, and finally persists the archive via ``self._save()``.
    Useful after bulk ingestion or threshold changes.

    Args:
        threshold: When given, temporarily replaces ``self.linker.threshold``
            for the duration of the rebuild. The previous value is restored
            afterwards, even if linking raises.

    Returns:
        Sum of the values returned by ``apply_links`` across all entries
        (presumably the number of links created — confirm against the
        linker implementation).

    Note:
        If ``apply_links`` raises, the exception propagates and ``_save()``
        is not called; links already cleared in memory stay cleared.
    """
    override = threshold is not None
    if override:
        old_threshold = self.linker.threshold
        self.linker.threshold = threshold

    try:
        entries = list(self._entries.values())

        # Clear all links before re-linking so stale links cannot survive.
        for entry in entries:
            entry.links = []

        # Re-link each entry against all others.
        total_links = 0
        for entry in entries:
            candidates = [e for e in entries if e.id != entry.id]
            total_links += self.linker.apply_links(entry, candidates)
    finally:
        # Never leave the linker stuck on the temporary threshold.
        if override:
            self.linker.threshold = old_threshold

    self._save()
    return total_links
|
||||
|
||||
Reference in New Issue
Block a user