feat: cross-repo dependency graph builder (#93)

2026-04-15 03:44:12 +00:00
2 changed files with 249 additions and 282 deletions
--- a/scripts/dead_code_detector.py
+++ b/scripts/dead_code_detector.py
@@ -1,282 +0,0 @@
 #!/usr/bin/env python3
 """
 Dead Code Detector for Python Codebases
 AST-based analysis to find defined but never-called functions and classes.
 Excludes entry points, plugin hooks, __init__ exports.
 Usage:
  python3 scripts/dead_code_detector.py /path/to/repo/
  python3 scripts/dead_code_detector.py hermes-agent/ --format json
  python3 scripts/dead_code_detector.py . --exclude tests/,venv/
 Output: file:line, function/class name, last git author (if available)
 """
 import argparse
 import ast
 import json
 import os
 import subprocess
 import sys
 from collections import defaultdict
 from pathlib import Path
 from typing import Optional
 # Names that are expected to be unused (entry points, protocol methods, etc.).
 # Entries listed in _SAFE_UNUSED_PREFIXES are matched as name prefixes; every
 # other entry must match the whole name.
 SAFE_UNUSED_PATTERNS = {
    # Python dunders
    "__init__", "__str__", "__repr__", "__eq__", "__hash__", "__len__",
    "__getitem__", "__setitem__", "__contains__", "__iter__", "__next__",
    "__enter__", "__exit__", "__call__", "__bool__", "__del__",
    "__post_init__", "__class_getitem__",
    # Common entry points
    "main", "app", "handler", "setup", "teardown", "fixture",
    # pytest
    "conftest", "test_", "pytest_",  # prefix patterns
    # Protocols / abstract
    "abstractmethod", "abc_",
 }
 # Subset of SAFE_UNUSED_PATTERNS that is matched with str.startswith().
 _SAFE_UNUSED_PREFIXES = ("test_", "pytest_", "abc_")
 # Directory names whose contents are treated as test code.
 _TEST_DIR_NAMES = {"test", "tests", "testing"}
 def _is_test_path(filepath: str) -> bool:
    """Return True when *filepath* is a test module or lives in a test directory."""
    # Compare path components rather than raw substrings so that names such
    # as "latest.py" or "contest/" are not mistaken for test code.
    parts = filepath.lower().replace("\\", "/").split("/")
    filename = parts[-1]
    if any(part in _TEST_DIR_NAMES for part in parts[:-1]):
        return True
    return (
        filename in ("conftest.py", "test.py", "tests.py")
        or filename.startswith("test_")
        or filename.endswith("_test.py")
    )
 def is_safe_unused(name: str, filepath: str) -> bool:
    """Check if an unused name is expected to be unused.

    Args:
        name: Name of the function or class that has no recorded usage.
        filepath: Path of the file that defines ``name``.

    Returns:
        True when the definition should not be reported as dead code: it
        lives in test code or an ``__init__.py``, it is a dunder method, or
        it matches one of the known entry-point names/prefixes.
    """
    # Test code and package __init__ exports are exempt wholesale.
    if _is_test_path(filepath) or filepath.endswith("__init__.py"):
        return True
    # Any dunder is invoked implicitly by the interpreter, not by name.
    if len(name) > 4 and name.startswith("__") and name.endswith("__"):
        return True
    # Exact entry-point names, then the explicit prefix patterns. Matching
    # every pattern as a prefix would also exempt e.g. "apply_patch" ("app").
    return name in SAFE_UNUSED_PATTERNS or name.startswith(_SAFE_UNUSED_PREFIXES)
 def get_git_blame(filepath: str, lineno: int) -> Optional[str]:
    """Get last author of a line via git blame.

    Args:
        filepath: Path to the file to blame (absolute or relative to cwd).
        lineno: 1-based line number to look up.

    Returns:
        The author name reported by ``git blame --porcelain``, or None when
        git is unavailable, the file is not tracked, or the call fails.
    """
    target = os.path.abspath(filepath)
    try:
        # Run git from the file's own directory so the lookup works even when
        # the tool is started outside the scanned repository.
        result = subprocess.run(
            ["git", "blame", "-L", f"{lineno},{lineno}", "--porcelain", "--",
             os.path.basename(target)],
            capture_output=True, text=True, timeout=5,
            cwd=os.path.dirname(target),
        )
    except (OSError, subprocess.SubprocessError, UnicodeDecodeError):
        # Best effort only: missing git binary, missing directory, timeout,
        # or undecodable output all mean "author unknown".
        return None
    if result.returncode != 0:
        return None
    prefix = "author "
    for line in result.stdout.splitlines():
        if line.startswith(prefix):
            return line[len(prefix):]
    return None
 class DefinitionCollector(ast.NodeVisitor):
    """Gather every function, async function and class definition in a tree.

    After ``visit(tree)``, ``definitions`` holds one ``(name, kind, lineno)``
    tuple per definition, in visit order. Nested definitions are included
    because each visitor keeps descending into the node's children.
    """
    def __init__(self):
        self.definitions = []  # (name, kind, lineno)
    def _record(self, node, kind):
        """Store one definition and continue into its body."""
        entry = (node.name, kind, node.lineno)
        self.definitions.append(entry)
        self.generic_visit(node)
    def visit_FunctionDef(self, node):
        self._record(node, "function")
    def visit_AsyncFunctionDef(self, node):
        self._record(node, "async_function")
    def visit_ClassDef(self, node):
        self._record(node, "class")
 class NameUsageCollector(ast.NodeVisitor):
    """Collect all name references (calls, imports, attribute access).

    Attributes (populated by ``visit(tree)``):
        names: identifiers read or written as plain names, plus every
            attribute name that is accessed (``obj.attr`` records both
            ``obj`` and ``attr``).
        calls: names of called functions and called attributes.
        imports: imported names, including both the original name and the
            alias for ``from m import x as y``.
    """
    def __init__(self):
        self.names = set()
        self.calls = set()
        self.imports = set()
    def visit_Name(self, node):
        self.names.add(node.id)
        self.generic_visit(node)
    def visit_Attribute(self, node):
        # Record the attribute itself so references without a call, such as
        # ``callback=self.handle`` or ``class C(mod.Base)``, count as usage.
        self.names.add(node.attr)
        if isinstance(node.value, ast.Name):
            self.names.add(node.value.id)
        self.generic_visit(node)
    def visit_Call(self, node):
        func = node.func
        if isinstance(func, ast.Name):
            self.calls.add(func.id)
        elif isinstance(func, ast.Attribute):
            if isinstance(func.value, ast.Name):
                self.names.add(func.value.id)
            self.calls.add(func.attr)
        self.generic_visit(node)
    def visit_Import(self, node):
        for alias in node.names:
            self.imports.add(alias.asname or alias.name)
        self.generic_visit(node)
    def visit_ImportFrom(self, node):
        for alias in node.names:
            # The original name is what matches a definition elsewhere in the
            # repo; the alias is what the importing module refers to.
            self.imports.add(alias.name)
            if alias.asname:
                self.imports.add(alias.asname)
        self.generic_visit(node)
 def analyze_file(filepath: str) -> dict:
    """Analyze a single Python file for dead code.

    Only usage inside the same file is considered, so a definition that is
    referenced exclusively from other files is still reported here.

    Args:
        filepath: Path of the Python source file to analyze.

    Returns:
        ``{"definitions": <count>, "dead": [<entry>, ...]}`` where each entry
        has ``name``, ``type``, ``file`` and ``line`` keys, or
        ``{"error": <message>}`` when the file cannot be read or parsed.
    """
    path = Path(filepath)
    try:
        # Parse from bytes so the source encoding (UTF-8 by default, or a
        # PEP 263 coding cookie) is honoured instead of the locale encoding.
        tree = ast.parse(path.read_bytes(), filename=str(filepath))
    except (OSError, SyntaxError, ValueError):
        return {"error": f"Could not parse {filepath}"}
    # Collect definitions
    def_collector = DefinitionCollector()
    def_collector.visit(tree)
    definitions = def_collector.definitions
    # Collect usage
    usage_collector = NameUsageCollector()
    usage_collector.visit(tree)
    used_names = usage_collector.names | usage_collector.calls | usage_collector.imports
    dead = [
        {"name": name, "type": def_type, "file": filepath, "line": lineno}
        for name, def_type, lineno in definitions
        if name not in used_names and not is_safe_unused(name, filepath)
    ]
    return {"definitions": len(definitions), "dead": dead}
 def scan_repo(repo_path: str, exclude_patterns: list = None) -> dict:
    """Scan an entire repo for dead code.

    Every ``*.py`` file under ``repo_path`` is parsed once; its definitions
    and its name usages are collected, and a definition is reported when its
    name is not used anywhere in the scanned files.

    Args:
        repo_path: Root directory of the repository.
        exclude_patterns: Path component names to skip (directories or file
            names). Surrounding whitespace and leading/trailing slashes are
            ignored, so ``"tests/"`` and ``"tests"`` are equivalent. When
            omitted or empty, a default list of common non-source
            directories is used.

    Returns:
        A dict with ``repo``, ``files_scanned``, ``total_definitions``,
        ``dead_code_count`` and ``dead_code`` (entries sorted by file, line).
    """
    path = Path(repo_path)
    raw_exclude = exclude_patterns or ["venv", ".venv", "node_modules", "__pycache__",
                                       ".git", "dist", "build", ".tox", "vendor"]
    exclude = {pattern.strip().strip("/\\") for pattern in raw_exclude}
    exclude.discard("")
    all_definitions = {}  # name -> [{file, line, type}]
    all_used_names = set()
    files_scanned = 0
    dead_code = []
    # Single pass: collect definitions and usages from each parsed file.
    for fpath in path.rglob("*.py"):
        rel = fpath.relative_to(path)
        # Match against parts relative to the repo root so that an ancestor
        # directory named e.g. "build" does not exclude the whole repo.
        if any(part in exclude for part in rel.parts):
            continue
        if fpath.name.startswith("."):
            continue
        try:
            content = fpath.read_text(errors="ignore")
            tree = ast.parse(content, filename=str(fpath))
        except (OSError, SyntaxError, ValueError, RecursionError):
            # Unreadable or unparsable files are skipped, not fatal.
            continue
        files_scanned += 1
        collector = DefinitionCollector()
        collector.visit(tree)
        rel_path = str(rel)
        for name, def_type, lineno in collector.definitions:
            all_definitions.setdefault(name, []).append({
                "file": rel_path,
                "line": lineno,
                "type": def_type,
            })
        usage = NameUsageCollector()
        usage.visit(tree)
        all_used_names.update(usage.names, usage.calls, usage.imports)
    # Find dead code
    for name, locations in all_definitions.items():
        if name in all_used_names:
            continue
        for loc in locations:
            if not is_safe_unused(name, loc["file"]):
                dead_code.append({
                    "name": name,
                    "type": loc["type"],
                    "file": loc["file"],
                    "line": loc["line"],
                })
    return {
        # Path(".").name is empty; fall back to the resolved directory name.
        "repo": path.name or path.resolve().name,
        "files_scanned": files_scanned,
        "total_definitions": sum(len(v) for v in all_definitions.values()),
        "dead_code_count": len(dead_code),
        "dead_code": sorted(dead_code, key=lambda x: (x["file"], x["line"])),
    }
 def main():
    """Command-line entry point: scan a repo and print the dead-code report.

    Parses ``repo``, ``--format``, ``--exclude`` and ``--git-blame``, runs
    ``scan_repo`` and prints either the raw result as JSON or a text table
    (optionally annotated with the last git author of each line).
    """
    cli = argparse.ArgumentParser(description="Find dead code in Python codebases")
    cli.add_argument("repo", help="Repository path to scan")
    cli.add_argument("--format", choices=["text", "json"], default="text")
    cli.add_argument("--exclude", help="Comma-separated patterns to exclude")
    cli.add_argument("--git-blame", action="store_true", help="Include git blame info")
    args = cli.parse_args()
    exclude = None
    if args.exclude:
        exclude = args.exclude.split(",")
    report = scan_repo(args.repo, exclude)
    # JSON mode dumps the scan result verbatim and is done.
    if args.format == "json":
        print(json.dumps(report, indent=2))
        return
    print(f"Dead Code Report: {report['repo']}")
    print(f"Files scanned: {report['files_scanned']}")
    print(f"Total definitions: {report['total_definitions']}")
    print(f"Dead code found: {report['dead_code_count']}")
    print()
    findings = report["dead_code"]
    if not findings:
        print("No dead code detected!")
        return
    print(f"{'File':<45} {'Line':>4} {'Type':<10} {'Name'}")
    print("-" * 85)
    for entry in findings:
        suffix = ""
        if args.git_blame:
            # Blame is looked up only on request; it spawns git per finding.
            who = get_git_blame(
                os.path.join(args.repo, entry["file"]),
                entry["line"]
            )
            if who:
                suffix = f" ({who})"
        print(f"{entry['file']:<45} {entry['line']:>4} {entry['type']:<10} {entry['name']}{suffix}")
 if __name__ == "__main__":
    main()
--- a/scripts/dependency_graph.py
+++ b/scripts/dependency_graph.py
@@ -0,0 +1,249 @@
 #!/usr/bin/env python3
 """
 Cross-Repo Dependency Graph Builder
 Scans repos for import/require/reference patterns and builds a directed
 dependency graph. Detects circular dependencies. Outputs DOT and Mermaid.
 Usage:
  python3 scripts/dependency_graph.py /path/to/repos/
  python3 scripts/dependency_graph.py --repos repo1,repo2,repo3 --format mermaid
  python3 scripts/dependency_graph.py --repos-dir /path/to/ --format dot --output deps.dot
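 Detection is name-based: a repo is recorded as depending on another repo
 when any scanned file mentions that repo's name (hyphenated, underscored,
 or unseparated form). This catches references such as:
  - Python: import X, from X import Y
  - JavaScript: require("X"), import ... from "X"
  - Go: import "X"
  - Ansible: include_role, import_role
  - Docker/Compose: image: X, depends_on
  - Config references: repo-name in YAML/TOML/JSON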
 """
 import argparse
 import json
 import os
 import re
 import sys
 from collections import defaultdict
 from pathlib import Path
 # Known repo names for matching.
 # scan_file_for_deps() looks for each of these names (and its underscore /
 # no-separator variants) in file contents; a repo that is not listed here
 # can never be reported as a dependency.
 KNOWN_REPOS = [
    "hermes-agent", "timmy-config", "timmy-home", "the-nexus", "the-door",
    "the-beacon", "fleet-ops", "burn-fleet", "timmy-dispatch", "turboquant",
    "compounding-intelligence", "the-playground", "second-son-of-timmy",
    "ai-safety-review", "the-echo-pattern", "timmy-academy", "wolf",
    "the-testament",
 ]
 def normalize_repo_name(name: str) -> str:
    """Normalize a repo name for comparison.

    Lower-cases the name, drops surrounding whitespace and a trailing
    ``.git`` suffix, and converts underscores to hyphens.

    Args:
        name: Repo name as found on disk or in a URL, e.g. ``Hermes_Agent.git``.

    Returns:
        The canonical form, e.g. ``hermes-agent``.
    """
    cleaned = name.strip().lower()
    # Only strip ".git" as a suffix; removing it anywhere would corrupt
    # names such as "my.github.io".
    if cleaned.endswith(".git"):
        cleaned = cleaned[:-len(".git")]
    return cleaned.replace("_", "-").strip()
 def scan_file_for_deps(filepath: str, content: str, own_repo: str) -> set:
    """Scan a file's content for references to other repos.

    A repo from ``KNOWN_REPOS`` counts as referenced when its name appears in
    ``content`` in hyphenated, underscored or unseparated form and is not
    embedded in a longer alphanumeric word (so ``wolf`` does not match
    ``werewolf``). Matching is case-sensitive.

    Args:
        filepath: Path of the scanned file; currently not used for matching.
        content: Text of the file.
        own_repo: Name of the repo the file belongs to; never reported.

    Returns:
        Set of ``KNOWN_REPOS`` entries referenced by ``content``.
    """
    deps = set()
    own_norm = normalize_repo_name(own_repo)
    for repo in KNOWN_REPOS:
        if normalize_repo_name(repo) == own_norm:
            continue
        # Path, quoted and "Timmy_Foundation/<repo>" references all contain
        # one of these three spellings, so no further patterns are needed.
        variants = {repo, repo.replace("-", "_"), repo.replace("-", "")}
        alternation = "|".join(
            re.escape(variant) for variant in sorted(variants, key=len, reverse=True)
        )
        # Letters/digits on either side mean the name is part of another word.
        pattern = rf"(?<![A-Za-z0-9])(?:{alternation})(?![A-Za-z0-9])"
        if re.search(pattern, content):
            deps.add(repo)
    return deps
 def scan_repo(repo_path: str, repo_name: str = None) -> dict:
    """Scan a repo directory for dependencies.

    Args:
        repo_path: Root directory of the repository.
        repo_name: Name to report and to exclude from its own dependencies;
            defaults to the directory name.

    Returns:
        ``{"repo", "dependencies", "files_scanned"}`` with dependencies as a
        sorted list, or ``{"error": <message>}`` if ``repo_path`` is not a
        directory.
    """
    path = Path(repo_path)
    if not path.is_dir():
        return {"error": f"Not a directory: {repo_path}"}
    if not repo_name:
        # Path(".").name is empty; fall back to the resolved directory name.
        repo_name = path.name or path.resolve().name
    deps = set()
    files_scanned = 0
    exts = {".py", ".js", ".ts", ".go", ".yaml", ".yml", ".toml", ".json",
            ".md", ".sh", ".bash", ".Dockerfile", ".tf", ".hcl"}
    skip_dirs = {".git", "node_modules", "__pycache__", ".venv", "venv",
                 "vendor", "dist", "build", ".tox"}
    for fpath in path.rglob("*"):
        # Skip common non-source dirs. Only parts below the repo root are
        # checked so an ancestor named e.g. "build" does not hide everything.
        if any(part in skip_dirs for part in fpath.relative_to(path).parts):
            continue
        # A plain "Dockerfile" (or "Dockerfile.dev") has no matching suffix.
        is_dockerfile = fpath.name == "Dockerfile" or fpath.name.startswith("Dockerfile.")
        if fpath.suffix not in exts and not is_dockerfile:
            continue
        if not fpath.is_file():
            continue
        try:
            content = fpath.read_text(errors="ignore")
        except OSError:
            # Unreadable files (permissions, broken links) are skipped.
            continue
        deps.update(scan_file_for_deps(str(fpath), content, repo_name))
        files_scanned += 1
    return {
        "repo": repo_name,
        "dependencies": sorted(deps),
        "files_scanned": files_scanned,
    }
 def detect_cycles(graph: dict) -> list:
    """Detect circular dependencies using DFS.

    Args:
        graph: Mapping of repo name to a dict whose ``"dependencies"`` entry
            lists the repos it depends on. Dependencies that are not keys of
            ``graph`` are treated as leaves.

    Returns:
        A list of cycles, each a list of node names that starts and ends with
        the same node (e.g. ``["a", "b", "a"]``). One cycle is reported per
        back edge met during the traversal.
    """
    cycles = []
    visited = set()
    path = []          # nodes on the current DFS path, in order
    position = {}      # node -> index in ``path`` while it is on the path
    def dfs(node):
        visited.add(node)
        position[node] = len(path)
        path.append(node)
        for neighbor in graph.get(node, {}).get("dependencies", []):
            if neighbor in position:
                # Back edge: the cycle is the path slice from neighbor onward.
                cycles.append(path[position[neighbor]:] + [neighbor])
            elif neighbor not in visited:
                dfs(neighbor)
        # Always unwind, even after a cycle was found, so later roots never
        # see stale "on path" state.
        path.pop()
        del position[node]
    for node in graph:
        if node not in visited:
            dfs(node)
    return cycles
 def to_dot(graph: dict) -> str:
    """Generate DOT format output.

    Args:
        graph: Mapping of repo name to a dict with a ``"dependencies"`` list.

    Returns:
        Graphviz source for a left-to-right digraph with one node per repo
        and one edge per dependency. Repos with more than two dependencies
        get a different fill colour.
    """
    # Single-quoted Python strings: the DOT attributes contain double quotes.
    lines = ["digraph dependencies {"]
    lines.append("  rankdir=LR;")
    lines.append('  node [shape=box, style=filled, fillcolor="#1a1a2e", fontcolor="#e6edf3"];')
    lines.append('  edge [color="#4a4a6a"];')
    lines.append("")
    for repo, data in sorted(graph.items()):
        dependencies = data.get("dependencies", [])
        fill = "#2d1b69" if len(dependencies) > 2 else "#16213e"
        lines.append(f'  "{repo}" [fillcolor="{fill}"];')
        for dep in dependencies:
            lines.append(f'  "{repo}" -> "{dep}";')
    lines.append("}")
    return "\n".join(lines)
 def to_mermaid(graph: dict) -> str:
    """Generate Mermaid format output.

    Args:
        graph: Mapping of repo name to a dict with a ``"dependencies"`` list.

    Returns:
        A ``graph LR`` Mermaid diagram: one edge line per dependency, a blank
        line, then one label line per node. Node ids are the repo names with
        hyphens replaced by underscores; labels show the original name.
    """
    def node_id(name):
        return name.replace("-", "_")
    lines = ["graph LR"]
    nodes = set(graph)
    for repo, data in sorted(graph.items()):
        for dep in data.get("dependencies", []):
            lines.append(f"    {node_id(repo)} --> {node_id(dep)}")
            # Dependencies that were not scanned themselves still need a
            # label, otherwise they render with the underscore id.
            nodes.add(dep)
    # Add node labels
    lines.append("")
    lines.extend(f"    {node_id(name)}[{name}]" for name in sorted(nodes))
    return "\n".join(lines)
 def main():
    """Command-line entry point: scan repos and emit the dependency graph.

    Repos come from ``--repos`` (comma-separated paths) or from the
    sub-directories of a base directory given either positionally or via
    ``--repos-dir``. With ``--cycles-only`` the process exits 1 when a
    circular dependency exists and 0 otherwise; in all other cases the graph
    is written as JSON, DOT or Mermaid to stdout or to ``--output``.
    """
    parser = argparse.ArgumentParser(description="Build cross-repo dependency graph")
    parser.add_argument("repos_dir", nargs="?", help="Directory containing repos")
    # Option form of the positional argument, as shown in the module usage.
    # A separate dest keeps the empty positional from overwriting it.
    parser.add_argument("--repos-dir", dest="repos_dir_opt", metavar="DIR",
                        help="Directory containing repos (same as the positional argument)")
    parser.add_argument("--repos", help="Comma-separated list of repo paths")
    parser.add_argument("--format", choices=["dot", "mermaid", "json"], default="json")
    parser.add_argument("--output", "-o", help="Output file (default: stdout)")
    parser.add_argument("--cycles-only", action="store_true", help="Only report cycles")
    args = parser.parse_args()
    results = {}
    base_dir = args.repos_dir_opt or args.repos_dir
    if args.repos:
        # Ignore empty entries produced by stray or trailing commas.
        repo_paths = [p.strip() for p in args.repos.split(",") if p.strip()]
    elif base_dir:
        base = Path(base_dir)
        if not base.is_dir():
            parser.error(f"Not a directory: {base_dir}")
        # Sorted so the scan order and the output are deterministic.
        repo_paths = sorted(
            str(p) for p in base.iterdir() if p.is_dir() and not p.name.startswith(".")
        )
    else:
        parser.print_help()
        sys.exit(1)
    for rpath in repo_paths:
        # Path(".").name is empty; fall back to the resolved directory name.
        name = Path(rpath).name or Path(rpath).resolve().name
        print(f"Scanning {name}...", file=sys.stderr)
        result = scan_repo(rpath, name)
        if "error" in result:
            print(f"Skipping {name}: {result['error']}", file=sys.stderr)
            continue
        results[name] = result
    # Detect cycles
    cycles = detect_cycles(results)
    if args.cycles_only:
        if cycles:
            print("CIRCULAR DEPENDENCIES DETECTED:")
            for cycle in cycles:
                print(f"  {' -> '.join(cycle)}")
            sys.exit(1)
        print("No circular dependencies found.")
        sys.exit(0)
    # Output
    if args.format == "dot":
        output = to_dot(results)
    elif args.format == "mermaid":
        output = to_mermaid(results)
    else:
        output = json.dumps({
            "repos": results,
            "cycles": cycles,
            "summary": {
                "total_repos": len(results),
                "total_deps": sum(len(r["dependencies"]) for r in results.values()),
                "cycles_found": len(cycles),
            }
        }, indent=2)
    if args.output:
        Path(args.output).write_text(output)
        print(f"Written to {args.output}", file=sys.stderr)
    else:
        print(output)
 if __name__ == "__main__":
    main()