feat: cross-repo dependency graph builder (#93)

2026-04-15 03:44:12 +00:00
2 changed files with 249 additions and 221 deletions
--- a/scripts/dependency_graph.py
+++ b/scripts/dependency_graph.py
@@ -0,0 +1,249 @@
+#!/usr/bin/env python3
+"""
+Cross-Repo Dependency Graph Builder
+
+Scans repos for import/require/reference patterns and builds a directed
+dependency graph. Detects circular dependencies. Outputs DOT and Mermaid.
+
+Usage:
+  python3 scripts/dependency_graph.py /path/to/repos/
+  python3 scripts/dependency_graph.py --repos repo1,repo2,repo3 --format mermaid
+  python3 scripts/dependency_graph.py /path/to/ --format dot --output deps.dot
+
+How references are detected:
+  - Matching is by repo name (see KNOWN_REPOS), not by parsing imports.
+  - Python/JavaScript/Go imports, Ansible include_role/import_role, and
+    Docker/Compose image:/depends_on entries count only when they contain
+    a known repo name.
+  - The same holds for repo names in YAML/TOML/JSON config, Markdown,
+    shell, and Terraform files.
+"""
+
+import argparse
+import json
+import os
+import re
+import sys
+from collections import defaultdict
+from pathlib import Path
+
+
+# Repo names treated as possible dependency targets; scan_file_for_deps() looks for these in file contents.
+KNOWN_REPOS = [
+    "hermes-agent", "timmy-config", "timmy-home", "the-nexus", "the-door",
+    "the-beacon", "fleet-ops", "burn-fleet", "timmy-dispatch", "turboquant",
+    "compounding-intelligence", "the-playground", "second-son-of-timmy",
+    "ai-safety-review", "the-echo-pattern", "timmy-academy", "wolf",
+    "the-testament",
+]
+
+
+def normalize_repo_name(name: str) -> str:
+    """Return `name` stripped, lowercased, without a trailing ".git", with "_" mapped to "-"."""
+    return re.sub(r"\.git$", "", name.strip().lower()).replace("_", "-")
+
+
+def scan_file_for_deps(filepath: str, content: str, own_repo: str) -> set:
+    """Return the KNOWN_REPOS names that `content` mentions.
+
+    A repo counts as referenced when its hyphenated, underscored, or
+    separator-free spelling appears with no letter or digit directly before
+    or after it. Quoted names, path segments, and dotted module paths still
+    match, but a short name such as "wolf" does not match inside "wolfram".
+
+    Args:
+        filepath: Path of the scanned file; kept for API compatibility, unused.
+        content: Full text of the file.
+        own_repo: Name of the repo that owns the file; never reported.
+
+    Returns:
+        Set of referenced repo names, spelled as they appear in KNOWN_REPOS.
+    """
+    own_norm = normalize_repo_name(own_repo)
+    deps = set()
+    for repo in KNOWN_REPOS:
+        if normalize_repo_name(repo) == own_norm:
+            continue  # a repo mentioning itself is not a dependency
+        # Quoted, path, and dotted forms all contain one of these spellings,
+        # so three alternatives are enough.
+        spellings = sorted({repo, repo.replace("-", "_"), repo.replace("-", "")})
+        names = "|".join(re.escape(s) for s in spellings)
+        if re.search(rf"(?<![A-Za-z0-9])(?:{names})(?![A-Za-z0-9])", content):
+            deps.add(repo)
+    return deps
+
+
+def scan_repo(repo_path: str, repo_name: str = None) -> dict:
+    """Scan one repo checkout and collect the known repos its files mention.
+
+    Args:
+        repo_path: Directory to scan recursively.
+        repo_name: Name to report for the repo; defaults to the directory name.
+
+    Returns:
+        {"error": ...} when repo_path is not a directory, otherwise a dict with
+        "repo", "dependencies" (sorted list) and "files_scanned".
+    """
+    path = Path(repo_path)
+    if not path.is_dir():
+        return {"error": f"Not a directory: {repo_path}"}
+    repo_name = repo_name or path.name
+    exts = {".py", ".js", ".ts", ".go", ".yaml", ".yml", ".toml", ".json",
+            ".md", ".sh", ".bash", ".Dockerfile", ".tf", ".hcl"}
+    skip_dirs = {".git", "node_modules", "__pycache__", ".venv", "venv",
+                 "vendor", "dist", "build", ".tox"}
+    deps = set()
+    files_scanned = 0
+    for fpath in path.rglob("*"):
+        # A plain "Dockerfile" has an empty suffix, so match it by name.
+        if fpath.suffix not in exts and fpath.name != "Dockerfile":
+            continue
+        # Test only the parts below the repo root: a checkout that lives under
+        # e.g. ~/build/ must not have every one of its files skipped.
+        if skip_dirs.intersection(fpath.relative_to(path).parts):
+            continue
+        try:
+            if not fpath.is_file():
+                continue
+            content = fpath.read_text(encoding="utf-8", errors="ignore")
+        except OSError:
+            continue  # unreadable file: best-effort scan, move on
+        deps.update(scan_file_for_deps(str(fpath), content, repo_name))
+        files_scanned += 1
+
+    return {"repo": repo_name, "dependencies": sorted(deps), "files_scanned": files_scanned}
+
+
+def detect_cycles(graph: dict) -> list:
+    """Find circular dependencies with a depth-first search.
+
+    `graph` maps a repo name to a dict whose "dependencies" entry lists the
+    repos it depends on; names missing from `graph` are treated as leaves.
+    Returns one list per back edge, e.g. ["a", "b", "a"] for a <-> b.
+    """
+    cycles = []
+    visited = set()
+    path = []       # nodes on the current DFS branch, root first
+    position = {}   # node -> its index in `path`, for nodes on the branch
+
+    def dfs(node):
+        visited.add(node)
+        position[node] = len(path)
+        path.append(node)
+        for neighbor in graph.get(node, {}).get("dependencies", []):
+            if neighbor in position:
+                cycles.append(path[position[neighbor]:] + [neighbor])
+            elif neighbor not in visited:
+                dfs(neighbor)
+        # Unwind unconditionally so nodes of a reported cycle never look like
+        # branch members when a later root reaches them.
+        del position[path.pop()]
+
+    for node in graph:
+        if node not in visited:
+            dfs(node)
+    return cycles
+
+
+def to_dot(graph: dict) -> str:
+    """Render the graph as Graphviz DOT; repos with more than two deps get a distinct fill."""
+    lines = ["digraph dependencies {", "  rankdir=LR;"]
+    # Single-quoted Python strings: the DOT attributes need literal double quotes.
+    lines.append('  node [shape=box, style=filled, fillcolor="#1a1a2e", fontcolor="#e6edf3"];')
+    lines.append('  edge [color="#4a4a6a"];')
+    lines.append("")
+
+    for repo, data in sorted(graph.items()):
+        deps = data.get("dependencies", [])
+        fill = "#2d1b69" if len(deps) > 2 else "#16213e"
+        lines.append(f'  "{repo}" [fillcolor="{fill}"];')
+        for dep in deps:
+            lines.append(f'  "{repo}" -> "{dep}";')
+
+    lines.append("}")
+    return "\n".join(lines)
+
+
+def to_mermaid(graph: dict) -> str:
+    """Render the graph as a Mermaid "graph LR" flowchart.
+
+    Node ids are the repo names with "-" replaced by "_"; every repo that is
+    a key of `graph` also gets a label line showing its original name.
+    """
+    def node_id(name):
+        return name.replace("-", "_")
+
+    ordered = sorted(graph.items())
+    edges = [f"    {node_id(src)} --> {node_id(dst)}"
+             for src, info in ordered for dst in info.get("dependencies", [])]
+    labels = [f"    {node_id(src)}[{src}]" for src, _ in ordered]
+    return "\n".join(["graph LR", *edges, "", *labels])
+
+
+def main():
+    """Command-line entry point; exits 1 when no repos are given or --cycles-only finds a cycle."""
+    parser = argparse.ArgumentParser(description="Build cross-repo dependency graph")
+    parser.add_argument("repos_dir", nargs="?", help="Directory containing repos")
+    parser.add_argument("--repos", help="Comma-separated list of repo paths")
+    parser.add_argument("--format", choices=["dot", "mermaid", "json"], default="json")
+    parser.add_argument("--output", "-o", help="Output file (default: stdout)")
+    parser.add_argument("--cycles-only", action="store_true", help="Only report cycles")
+    args = parser.parse_args()
+
+    if args.repos:
+        # Drop empty entries so a trailing comma does not scan the current directory.
+        repo_paths = [p.strip() for p in args.repos.split(",") if p.strip()]
+    elif args.repos_dir:
+        base = Path(args.repos_dir)
+        if not base.is_dir():
+            parser.error(f"Not a directory: {args.repos_dir}")
+        # Sorted, because iterdir() order is arbitrary and output should be stable.
+        repo_paths = sorted(str(p) for p in base.iterdir() if p.is_dir() and not p.name.startswith("."))
+    else:
+        parser.print_help()
+        sys.exit(1)
+
+    results = {}
+    for rpath in repo_paths:
+        name = Path(rpath).name
+        print(f"Scanning {name}...", file=sys.stderr)
+        result = scan_repo(rpath, name)
+        if "error" in result:
+            print(f"Skipping {rpath}: {result['error']}", file=sys.stderr)
+            continue
+        results[name] = result
+
+    cycles = detect_cycles(results)
+    if args.cycles_only:
+        if not cycles:
+            print("No circular dependencies found.")
+            sys.exit(0)
+        print("CIRCULAR DEPENDENCIES DETECTED:")
+        for cycle in cycles:
+            print(f"  {' -> '.join(cycle)}")
+        sys.exit(1)
+
+    if args.format == "dot":
+        output = to_dot(results)
+    elif args.format == "mermaid":
+        output = to_mermaid(results)
+    else:
+        output = json.dumps({
+            "repos": results,
+            "cycles": cycles,
+            "summary": {
+                "total_repos": len(results),
+                "total_deps": sum(len(r["dependencies"]) for r in results.values()),
+                "cycles_found": len(cycles),
+            }
+        }, indent=2)
+
+    if args.output:
+        Path(args.output).write_text(output, encoding="utf-8")
+        print(f"Written to {args.output}", file=sys.stderr)
+    else:
+        print(output)
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/knowledge_staleness_check.py
+++ b/scripts/knowledge_staleness_check.py
@@ -1,221 +0,0 @@
-#!/usr/bin/env python3
-"""
-Knowledge Store Staleness Detector
-
-Checks knowledge entries against their source files to detect staleness.
-An entry is stale when its source file has been modified since extraction.
-
-Usage:
-  python3 scripts/knowledge_staleness_check.py knowledge/index.json
-  python3 scripts/knowledge_staleness_check.py --repo /path/to/repo --index knowledge/index.json
-  python3 scripts/knowledge_staleness_check.py --index knowledge/index.json --fix
-
-Expected index.json format:
-{
-  "version": 1,
-  "facts": [
-    {
-      "fact": "...",
-      "category": "fact|pitfall|pattern|tool-quirk",
-      "repo": "repo-name",
-      "confidence": 0.8,
-      "source_file": "path/to/file.py",
-      "source_hash": "sha256:abcdef...",
-      "extracted_at": "2026-04-13T20:00:00Z"
-    }
-  ]
-}
-"""
-
-import argparse
-import hashlib
-import json
-import sys
-from pathlib import Path
-from typing import Optional
-
-
-def compute_file_hash(filepath: str) -> Optional[str]:
-    """Compute SHA-256 hash of a file. Returns None if file not found."""
-    path = Path(filepath)
-    if not path.exists():
-        return None
-    content = path.read_bytes()
-    return hashlib.sha256(content).hexdigest()[:16]
-
-
-def check_staleness(index_path: str, repo_root: str = None) -> dict:
-    """Check all entries in the knowledge index for staleness."""
-    index = Path(index_path)
-    if not index.exists():
-        return {"error": f"Index not found: {index_path}"}
-
-    data = json.loads(index.read_text())
-    facts = data.get("facts", [])
-
-    if not facts:
-        return {
-            "total": 0,
-            "stale": 0,
-            "fresh": 0,
-            "no_source": 0,
-            "missing_files": 0,
-            "stale_entries": [],
-        }
-
-    # Determine repo root
-    if repo_root:
-        root = Path(repo_root)
-    else:
-        root = index.parent.parent  # knowledge/index.json -> repo root
-
-    results = {
-        "total": len(facts),
-        "stale": 0,
-        "fresh": 0,
-        "no_source": 0,
-        "missing_files": 0,
-        "stale_entries": [],
-    }
-
-    for i, entry in enumerate(facts):
-        source_file = entry.get("source_file")
-        stored_hash = entry.get("source_hash")
-
-        if not source_file:
-            results["no_source"] += 1
-            continue
-
-        if not stored_hash:
-            # Entry has source file but no hash — consider stale
-            results["stale"] += 1
-            results["stale_entries"].append({
-                "index": i,
-                "fact": entry.get("fact", "")[:100],
-                "source_file": source_file,
-                "reason": "no_hash",
-            })
-            continue
-
-        # Compute current hash
-        full_path = root / source_file
-        current_hash = compute_file_hash(str(full_path))
-
-        if current_hash is None:
-            results["missing_files"] += 1
-            results["stale_entries"].append({
-                "index": i,
-                "fact": entry.get("fact", "")[:100],
-                "source_file": source_file,
-                "reason": "file_missing",
-            })
-        elif current_hash != stored_hash:
-            results["stale"] += 1
-            results["stale_entries"].append({
-                "index": i,
-                "fact": entry.get("fact", "")[:100],
-                "source_file": source_file,
-                "stored_hash": stored_hash,
-                "current_hash": current_hash,
-                "reason": "hash_mismatch",
-            })
-        else:
-            results["fresh"] += 1
-
-    return results
-
-
-def add_hashes_to_index(index_path: str, repo_root: str = None) -> dict:
-    """Add source hashes to entries that are missing them."""
-    index = Path(index_path)
-    data = json.loads(index.read_text())
-    facts = data.get("facts", [])
-
-    if repo_root:
-        root = Path(repo_root)
-    else:
-        root = index.parent.parent
-
-    updated = 0
-    skipped = 0
-
-    for entry in facts:
-        source_file = entry.get("source_file")
-        if not source_file or entry.get("source_hash"):
-            skipped += 1
-            continue
-
-        full_path = root / source_file
-        file_hash = compute_file_hash(str(full_path))
-        if file_hash:
-            entry["source_hash"] = file_hash
-            updated += 1
-
-    if updated > 0:
-        index.write_text(json.dumps(data, indent=2) + "\n")
-
-    return {"updated": updated, "skipped": skipped, "total": len(facts)}
-
-
-def report_staleness(results: dict) -> str:
-    """Format staleness check results as a report."""
-    lines = []
-    lines.append("=" * 50)
-    lines.append("KNOWLEDGE STORE STALENESS REPORT")
-    lines.append("=" * 50)
-    lines.append(f"Total entries:  {results['total']}")
-    lines.append(f"Fresh:          {results['fresh']}")
-    lines.append(f"Stale:          {results['stale']}")
-    lines.append(f"No source:      {results['no_source']}")
-    lines.append(f"Missing files:  {results['missing_files']}")
-    lines.append("")
-
-    if results["stale_entries"]:
-        lines.append("STALE ENTRIES:")
-        lines.append("-" * 50)
-        for entry in results["stale_entries"]:
-            lines.append(f"  [{entry['reason']}] {entry['source_file']}")
-            lines.append(f"    {entry['fact']}")
-            if entry.get("stored_hash") and entry.get("current_hash"):
-                lines.append(f"    stored:  {entry['stored_hash']}")
-                lines.append(f"    current: {entry['current_hash']}")
-            lines.append("")
-
-    if results["total"] > 0:
-        staleness_pct = results["stale"] / results["total"] * 100
-        lines.append(f"Staleness rate: {staleness_pct:.1f}%")
-    else:
-        lines.append("No entries to check.")
-
-    return "\n".join(lines)
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Check knowledge store for stale entries")
-    parser.add_argument("--index", default="knowledge/index.json", help="Path to index.json")
-    parser.add_argument("--repo", help="Repository root (default: auto-detect from index path)")
-    parser.add_argument("--fix", action="store_true", help="Add missing hashes to index")
-    parser.add_argument("--json", action="store_true", help="Output JSON instead of report")
-    args = parser.parse_args()
-
-    if args.fix:
-        result = add_hashes_to_index(args.index, args.repo)
-        if args.json:
-            print(json.dumps(result, indent=2))
-        else:
-            print(f"Updated {result['updated']} entries with source hashes.")
-            print(f"Skipped {result['skipped']} (already had hashes or no source file).")
-    else:
-        results = check_staleness(args.index, args.repo)
-        if "error" in results:
-            print(f"Error: {results['error']}", file=sys.stderr)
-            sys.exit(1)
-
-        if args.json:
-            print(json.dumps(results, indent=2))
-        else:
-            print(report_staleness(results))
-
-
-if __name__ == "__main__":
-    main()