5.6: Add dependency bloat detector

Scans all .py files for import statements, compares them against requirements.txt, and reports declared packages that are never imported.

Acceptance:
- Lists all imports in code
- Compares against declared deps
- Reports unused deps
- Output: bloat report

Refs #112
2026-04-26 00:50:36 -04:00
3 changed files with 112 additions and 311 deletions
--- a/scripts/dependency_bloat_detector.py
+++ b/scripts/dependency_bloat_detector.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+"""
+Dependency Bloat Detector — find declared packages never imported
+
+Usage:
+  python3 scripts/dependency_bloat_detector.py
+  python3 scripts/dependency_bloat_detector.py --output json
+"""
+
+import ast
+import json
+import re
+import sys
+from pathlib import Path
+from typing import Set, List, Tuple
+
+
+def extract_imports_from_py_files(repo_path: Path) -> Set[str]:
+    """Return the top-level module names imported by .py files under repo_path.
+
+    Excluded directories are matched against the path relative to repo_path,
+    so a checkout living under a directory such as build/ is still scanned.
+    Relative imports (``from .x import y``) are skipped; unreadable or
+    unparsable files are ignored (best-effort scan).
+    """
+    imports = set()
+    exclude_dirs = {".git", "venv", ".venv", "__pycache__", "node_modules",
+                    "dist", "build", ".tox", "vendor"}
+    for fpath in repo_path.rglob("*.py"):
+        if exclude_dirs.intersection(fpath.relative_to(repo_path).parts):
+            continue
+        try:
+            tree = ast.parse(fpath.read_text(errors="ignore"))
+        except Exception:  # best-effort: skip files that cannot be read/parsed
+            continue
+        for node in ast.walk(tree):
+            if isinstance(node, ast.Import):
+                imports.update(alias.name.split('.')[0] for alias in node.names)
+            elif isinstance(node, ast.ImportFrom) and node.module and node.level == 0:
+                # level > 0 is a relative import of the project's own code
+                imports.add(node.module.split('.')[0])
+    return imports
+
+
+def parse_requirements_txt(req_path: Path) -> List[Tuple[str, str]]:
+    """Parse requirements.txt into (package_name, requirement_line) pairs.
+
+    Blank lines, comments and pip option lines (-r, -e, --index-url, ...) are
+    skipped. A '#' only starts a comment at line start or after whitespace
+    (pip's rule), so URL fragments such as '#sha256=...' survive. Returns an
+    empty list when req_path does not exist.
+    """
+    if not req_path.exists():
+        return []
+    declared = []
+    for raw_line in req_path.read_text().splitlines():
+        # Drop comments first; what remains is the requirement itself
+        line = re.sub(r'(^|\s+)#.*$', '', raw_line).strip()
+        # A name must start with an alphanumeric character, which also
+        # rejects '-'-prefixed option lines and leaves empty lines unmatched
+        pkg_match = re.match(r'^([a-zA-Z0-9][a-zA-Z0-9_-]*)', line)
+        if pkg_match:
+            declared.append((pkg_match.group(1), line))
+    return declared
+
+
+def main():
+    """Print the bloat report; exit 1 when unused dependencies exist, else 0.
+
+    Names are compared case-insensitively with '-' and '_' treated alike, and
+    standard-library modules are never listed as undeclared imports.
+    Command-line arguments are not parsed: the report is always plain text.
+    """
+    repo_path = Path('.').resolve()
+    req_path = repo_path / 'requirements.txt'
+    used = extract_imports_from_py_files(repo_path)
+    declared = parse_requirements_txt(req_path)
+    declared_names = [pkg for pkg, _ in declared]
+
+    def norm(name):
+        return name.lower().replace('-', '_')
+
+    used_norm = {norm(imp) for imp in used}
+    declared_norm = {norm(pkg) for pkg in declared_names}
+    # sys.stdlib_module_names needs Python 3.10+; older versions skip the filter
+    stdlib = getattr(sys, 'stdlib_module_names', frozenset())
+    unused = [(raw, pkg) for pkg, raw in declared if norm(pkg) not in used_norm]
+    missing_from_req = sorted(imp for imp in used
+                              if imp not in stdlib and norm(imp) not in declared_norm)
+    bar = "=" * 60
+    report = [bar, "  DEPENDENCY BLOAT DETECTOR", bar,
+              f"  Repository:     {repo_path.name}",
+              f"  Requirements:   {req_path}",
+              f"  Python files:   {len(list(repo_path.rglob('*.py')))}", "",
+              f"  Declared packages  ({len(declared_names)}): {declared_names}",
+              f"  Imported packages  ({len(used)}): {sorted(used)}", ""]
+    if unused:
+        report.append("  UNUSED DEPENDENCIES (bloat):")
+        report.extend(f"    ✗ {raw}" for raw, _ in unused)
+    else:
+        report.append("  No unused dependencies detected.")
+    report.append("")
+    if missing_from_req:
+        report.append("  UNDECLARED IMPORTS (used but not in requirements.txt):")
+        report.extend(f"    ! {imp}" for imp in missing_from_req)
+    report += ["", bar]
+    print("\n".join(report))
+    sys.exit(1 if unused else 0)
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/graph_visualizer.py
+++ b/scripts/graph_visualizer.py
@@ -1,206 +0,0 @@
-#!/usr/bin/env python3
-"""
-graph_visualizer.py — Generate visual graph representations of the knowledge graph.
-
-Reads knowledge/index.json and renders the fact relationship graph.
-Supports ASCII terminal output and DOT export for Graphviz.
-
-Usage:
-    python3 scripts/graph_visualizer.py                  # ASCII, all nodes
-    python3 scripts/graph_visualizer.py --format dot     # DOT output
-    python3 scripts/graph_visualizer.py --seed root --max-depth 2
-    python3 scripts/graph_visualizer.py --filter-domain hermes-agent
-    python3 scripts/graph_visualizer.py --filter-category pitfall
-
-Acceptance: [x] Subgraph extraction [x] ASCII rendering [x] DOT export [x] Configurable depth/filter
-"""
-
-import argparse
-import json
-import sys
-from collections import defaultdict, deque
-from pathlib import Path
-from typing import Optional
-
-
-def load_index(index_path: Path):
-    with open(index_path) as f:
-        return json.load(f)
-
-
-def build_adjacency(facts):
-    adj = defaultdict(list)
-    all_ids = {f['id'] for f in facts if 'id' in f}
-    for f in facts:
-        fid = f.get('id')
-        if not fid:
-            continue
-        for rel in f.get('related', []):
-            if rel in all_ids:
-                adj[fid].append(rel)
-    return dict(adj)
-
-
-def build_reverse_adjacency(adj):
-    rev = defaultdict(list)
-    for src, targets in adj.items():
-        for tgt in targets:
-            rev[tgt].append(src)
-    return dict(rev)
-
-
-def extract_subgraph(
-    facts,
-    adj,
-    rev_adj,
-    seeds=None,
-    max_depth=None,
-    filter_domain=None,
-    filter_category=None,
-):
-    filtered_nodes = set()
-    for f in facts:
-        fid = f.get('id')
-        if not fid:
-            continue
-        if filter_domain and f.get('domain') != filter_domain:
-            continue
-        if filter_category and f.get('category') != filter_category:
-            continue
-        filtered_nodes.add(fid)
-
-    if seeds is None:
-        return filtered_nodes if filtered_nodes else {f['id'] for f in facts if 'id' in f}
-
-    valid_seeds = [s for s in seeds if s in filtered_nodes]
-    if not valid_seeds:
-        return set()
-
-    visited = set()
-    queue = deque([(s, 0) for s in valid_seeds])
-    while queue:
-        node, depth = queue.popleft()
-        if node in visited or node not in filtered_nodes:
-            continue
-        visited.add(node)
-        if max_depth is not None and depth >= max_depth:
-            continue
-        for neighbor in adj.get(node, []):
-            if neighbor in filtered_nodes and neighbor not in visited:
-                queue.append((neighbor, depth + 1))
-        for neighbor in rev_adj.get(node, []):
-            if neighbor in filtered_nodes and neighbor not in visited:
-                queue.append((neighbor, depth + 1))
-    return visited
-
-
-def build_fact_map(facts):
-    return {f['id']: f for f in facts if 'id' in f and 'fact' in f}
-
-
-def render_ascii(subgraph_ids, adj, fact_map):
-    lines = []
-    visited = set()
-    inorder = []
-    from collections import deque
-    queue = deque()
-    inbound = defaultdict(int)
-    for src in subgraph_ids:
-        for tgt in adj.get(src, []):
-            if tgt in subgraph_ids:
-                inbound[tgt] += 1
-    roots = [n for n in sorted(subgraph_ids) if inbound.get(n, 0) == 0]
-    if not roots:
-        roots = sorted(subgraph_ids)
-    for root in roots:
-        queue.append((root, 0, None))
-    while queue:
-        node, depth, parent_label = queue.popleft()
-        if node in visited:
-            continue
-        visited.add(node)
-        fact = fact_map.get(node, {})
-        label = fact.get('fact', str(node))[:80]
-        category = fact.get('category', 'fact')
-        domain = fact.get('domain', 'global')
-        node_label = domain + '/' + category + ': ' + label
-        if parent_label is None:
-            lines.append(f"{'  ' * depth}┌─ {node_label}")
-        else:
-            lines.append(f"{'  ' * depth}├─ {node_label}")
-        children = [c for c in adj.get(node, []) if c in subgraph_ids]
-        for i, child in enumerate(children):
-            queue.append((child, depth + 1, node))
-    if len(visited) < len(subgraph_ids):
-        lines.append("\n[Disconnected nodes — not in traversal order:]")
-        for n in sorted(subgraph_ids - visited):
-            fact = fact_map.get(n, {})
-            label = fact.get('fact', n)[:60]
-            lines.append(f"  {n} — {label}")
-    return "\n".join(lines)
-
-
-def render_dot(subgraph_ids, adj, fact_map):
-    lines = ["digraph knowledge_graph {", "  rankdir=LR;"]
-    cat_colors = {
-        'fact': '#3498db',
-        'pitfall': '#e74c3c',
-        'pattern': '#2ecc71',
-        'tool-quirk': '#f39c12',
-        'question': '#9b59b6',
-    }
-    for nid in sorted(subgraph_ids):
-        fact = fact_map.get(nid, {})
-        category = fact.get('category', 'fact')
-        domain = fact.get('domain', 'global')
-        label = fact.get('fact', nid).replace('"', '\\"')[:80]
-        fillcolor = cat_colors.get(category, '#666666')
-        lines.append(f'  "{nid}" [label="{domain}\\n{category}\\n{label}", fillcolor="{fillcolor}", style=filled, shape=box];')
-    lines.append("")
-    for src in sorted(subgraph_ids):
-        for tgt in adj.get(src, []):
-            if tgt in subgraph_ids:
-                lines.append(f'  "{src}" -> "{tgt}";')
-    lines.append("}")
-    return "\n".join(lines)
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Visualize the knowledge graph (ASCII terminal or DOT for Graphviz).")
-    parser.add_argument("--index", type=Path, default=Path(__file__).parent.parent / "knowledge" / "index.json",
-        help="Path to knowledge/index.json")
-    parser.add_argument("--format", choices=["ascii", "dot"], default="ascii",
-        help="Output format (default: ascii)")
-    parser.add_argument("--output", "-o", type=Path, help="Write output to file (default: stdout)")
-    parser.add_argument("--seed", help="Starting fact ID (comma-sep). Omit to render full graph.")
-    parser.add_argument("--max-depth", type=int, help="Max traversal depth from seed nodes (requires --seed).")
-    parser.add_argument("--filter-domain", help="Only include facts from this domain.")
-    parser.add_argument("--filter-category", help="Only include facts of this category.")
-    args = parser.parse_args()
-
-    index = load_index(args.index)
-    facts = index.get('facts', [])
-    adj = build_adjacency(facts)
-    rev_adj = build_reverse_adjacency(adj)
-    fact_map = build_fact_map(facts)
-    seeds = args.seed.split(',') if args.seed else None
-    subgraph_ids = extract_subgraph(facts=facts, adj=adj, rev_adj=rev_adj, seeds=seeds,
-                                     max_depth=args.max_depth,
-                                     filter_domain=args.filter_domain,
-                                     filter_category=args.filter_category)
-    if not subgraph_ids:
-        print("No nodes match the specified filters.", file=sys.stderr)
-        sys.exit(1)
-    if args.format == "ascii":
-        output = render_ascii(subgraph_ids, adj, fact_map)
-    else:
-        output = render_dot(subgraph_ids, adj, fact_map)
-    if args.output:
-        args.output.write_text(output)
-        print(f"Written: {args.output}", file=sys.stderr)
-    else:
-        print(output)
-
-
-if __name__ == "__main__":
-    main()
--- a/scripts/test_graph_visualizer.py
+++ b/scripts/test_graph_visualizer.py
@@ -1,105 +0,0 @@
-#!/usr/bin/env python3
-"""
-Tests for graph_visualizer.py — smoke test + subgraph logic.
-Run: python3 scripts/test_graph_visualizer.py
-"""
-
-import json, sys, tempfile
-from pathlib import Path
-sys.path.insert(0, str(Path(__file__).resolve().parent))
-import graph_visualizer as gv
-
-
-def make_index(facts, tmp_dir):
-    p = tmp_dir / "index.json"
-    p.write_text(json.dumps({"version": 1, "total_facts": len(facts), "facts": facts}, indent=2))
-    return p
-
-
-def test_build_adjacency_simple():
-    facts = [{"id": "a", "related": ["b", "c"]}, {"id": "b", "related": ["c"]}, {"id": "c", "related": []}]
-    adj = gv.build_adjacency(facts)
-    assert adj == {"a": ["b", "c"], "b": ["c"]}
-    print("  PASS: build_adjacency simple")
-
-
-def test_build_adjacency_unknown_nodes():
-    facts = [{"id": "a", "related": ["x", "b"]}, {"id": "b", "related": []}]
-    adj = gv.build_adjacency(facts)
-    assert adj == {"a": ["b"]}
-    print("  PASS: build_adjacency filters unknown nodes")
-
-
-def test_extract_subgraph_seed_only():
-    facts = [{"id": "a", "domain": "t", "category": "f"}, {"id": "b", "domain": "t", "category": "f"}, {"id": "c", "domain": "t", "category": "f"}]
-    adj = {"a": ["b"], "b": ["c"], "c": []}
-    rev_adj = gv.build_reverse_adjacency(adj)
-    sub = gv.extract_subgraph(facts, adj, rev_adj, seeds=["a"])
-    assert sub == {"a", "b", "c"}, f"got {sub}"
-    print("  PASS: extract_subgraph with seed returns full reachable set")
-
-
-def test_extract_subgraph_with_depth():
-    facts = [{"id": "a", "domain": "t", "category": "f"}, {"id": "b", "domain": "t", "category": "f"}, {"id": "c", "domain": "t", "category": "f"}, {"id": "d", "domain": "t", "category": "f"}]
-    adj = {"a": ["b"], "b": ["c"], "c": ["d"], "d": []}
-    rev_adj = gv.build_reverse_adjacency(adj)
-    sub = gv.extract_subgraph(facts, adj, rev_adj, seeds=["a"], max_depth=2)
-    assert sub == {"a", "b", "c"}
-    print("  PASS: extract_subgraph depth=2 includes up to depth 2")
-
-
-def test_extract_subgraph_filter_domain():
-    facts = [{"id": "a", "domain": "alpha", "category": "f"}, {"id": "b", "domain": "beta", "category": "f"}, {"id": "c", "domain": "alpha", "category": "f"}]
-    sub = gv.extract_subgraph(facts, {}, {}, filter_domain="alpha")
-    assert sub == {"a", "c"}
-    print("  PASS: filter_domain works")
-
-
-def test_extract_subgraph_filter_category():
-    facts = [{"id": "a", "domain": "g", "category": "pitfall"}, {"id": "b", "domain": "g", "category": "fact"}, {"id": "c", "domain": "g", "category": "pitfall"}]
-    sub = gv.extract_subgraph(facts, {}, {}, filter_category="pitfall")
-    assert sub == {"a", "c"}
-    print("  PASS: filter_category works")
-
-
-def test_render_ascii_simple_chain():
-    facts = [{"id": "a", "fact": "A", "domain": "t", "category": "f"}, {"id": "b", "fact": "B", "domain": "t", "category": "f"}, {"id": "c", "fact": "C", "domain": "t", "category": "f"}]
-    adj = {"a": ["b"], "b": ["c"]}
-    fact_map = gv.build_fact_map(facts)
-    out = gv.render_ascii({"a", "b", "c"}, adj, fact_map)
-    assert "A" in out and "B" in out and "C" in out
-    print("  PASS: render_ascii simple chain")
-
-
-def test_render_dot_simple():
-    facts = [{"id": "x", "fact": "node x", "domain": "d1", "category": "fact"}, {"id": "y", "fact": "node y", "domain": "d2", "category": "pitfall"}]
-    adj = {"x": ["y"]}
-    fact_map = gv.build_fact_map(facts)
-    out = gv.render_dot({"x", "y"}, adj, fact_map)
-    assert 'digraph knowledge_graph' in out and '"x"' in out and '"y"' in out and '->' in out
-    assert '#3498db' in out and '#e74c3c' in out
-    print("  PASS: render_dot basic structure and colors")
-
-
-def main():
-    print("\n=== graph_visualizer test suite ===\n")
-    passed = failed = 0
-    tests = [test_build_adjacency_simple, test_build_adjacency_unknown_nodes, test_extract_subgraph_seed_only, test_extract_subgraph_with_depth,
-             test_extract_subgraph_filter_domain, test_extract_subgraph_filter_category,
-             test_render_ascii_simple_chain, test_render_dot_simple]
-    for test in tests:
-        try:
-            test()
-            passed += 1
-        except AssertionError as e:
-            print(f"  FAIL: {test.__name__} — {e}")
-            failed += 1
-        except Exception as e:
-            print(f"  ERROR: {test.__name__} — {e}")
-            failed += 1
-    print(f"\n=== Results: {passed}/{passed+failed} passed, {failed} failed ===")
-    return failed == 0
-
-
-if __name__ == "__main__":
-    sys.exit(0 if main() else 1)