#!/usr/bin/env python3
"""
Dead Code Detector for Python Codebases

AST-based analysis to find defined but never-called functions and classes.
Excludes entry points, plugin hooks, __init__ exports.

Usage:
    python3 scripts/dead_code_detector.py /path/to/repo/
    python3 scripts/dead_code_detector.py hermes-agent/ --format json
    python3 scripts/dead_code_detector.py . --exclude tests/,venv/

Output: file:line, function/class name, last git author (if available)
"""

# Recovered from the patch "feat: dead code detector for Python codebases (#94)",
# which adds this module as scripts/dead_code_detector.py.

import argparse
import ast
import json
import os
import subprocess
import sys
from collections import defaultdict
from pathlib import Path
from typing import Optional


# Names that are expected to be unused (entry points, protocol methods, etc.).
# These are matched EXACTLY; prefix matching is reserved for
# SAFE_UNUSED_PREFIXES so that e.g. "app" does not exempt "apply_discount".
SAFE_UNUSED_NAMES = frozenset({
    # Python dunders (any other __dunder__ is also exempt, see is_safe_unused)
    "__init__", "__str__", "__repr__", "__eq__", "__hash__", "__len__",
    "__getitem__", "__setitem__", "__contains__", "__iter__", "__next__",
    "__enter__", "__exit__", "__call__", "__bool__", "__del__",
    "__post_init__", "__class_getitem__",
    # Common entry points
    "main", "app", "handler", "setup", "teardown", "fixture",
    # pytest
    "conftest",
    # Protocols / abstract
    "abstractmethod",
})

# Name prefixes that mark a definition as discovered by a framework.
SAFE_UNUSED_PREFIXES = ("test_", "pytest_", "abc_")

# Backward-compatible alias: the original module exposed one combined set.
SAFE_UNUSED_PATTERNS = set(SAFE_UNUSED_NAMES) | set(SAFE_UNUSED_PREFIXES)

# Directory names whose contents are treated as test code.
_TEST_DIR_NAMES = frozenset({"test", "tests", "testing"})

# Directories skipped when the caller supplies no exclude patterns.
_DEFAULT_EXCLUDES = ("venv", ".venv", "node_modules", "__pycache__",
                     ".git", "dist", "build", ".tox", "vendor")


def _is_test_path(filepath: str) -> bool:
    """Return True when *filepath* names a test module or lives in a test dir."""
    path = Path(filepath)
    filename = path.name.lower()
    if filename in ("conftest.py", "test.py", "tests.py"):
        return True
    if filename.startswith("test_") or filename.endswith("_test.py"):
        return True
    # Compare whole path components so "latest.py" or "contest/" do not match.
    return any(part.lower() in _TEST_DIR_NAMES for part in path.parts[:-1])


def is_safe_unused(name: str, filepath: str) -> bool:
    """Check if an unused name is expected to be unused.

    Args:
        name: The function or class name that has no detected reference.
        filepath: Path of the file defining it (relative or absolute).

    Returns:
        True if the definition should not be reported: it lives in a test
        file or an ``__init__.py``, it is a dunder method, or its name is a
        known entry point / framework-discovered prefix.
    """
    # Test files are exempt
    if _is_test_path(filepath):
        return True

    # Dunder methods are invoked implicitly by the interpreter.
    if len(name) > 4 and name.startswith("__") and name.endswith("__"):
        return True

    # Known entry points (exact) and framework prefixes.
    if name in SAFE_UNUSED_NAMES or name.startswith(SAFE_UNUSED_PREFIXES):
        return True

    # __init__.py exports are often unused internally
    return Path(filepath).name == "__init__.py"


def get_git_blame(filepath: str, lineno: int) -> Optional[str]:
    """Get last author of a line via git blame.

    Args:
        filepath: Path to the file to blame.
        lineno: 1-based line number.

    Returns:
        The author name reported by ``git blame --porcelain``, or None when
        git is unavailable, times out, or reports no author for the line.
    """
    target = Path(filepath).resolve()
    try:
        # Run inside the file's own directory so the lookup works even when
        # the scanned repository is not the current working directory.
        result = subprocess.run(
            ["git", "blame", "-L", f"{lineno},{lineno}", "--porcelain",
             "--", target.name],
            cwd=str(target.parent),
            capture_output=True,
            encoding="utf-8",
            errors="replace",
            timeout=5,
            check=False,
        )
    except (OSError, subprocess.SubprocessError):
        # Missing git binary, missing directory, or timeout: best effort only.
        return None

    prefix = "author "
    for line in result.stdout.splitlines():
        if line.startswith(prefix):
            return line[len(prefix):]
    return None


class DefinitionCollector(ast.NodeVisitor):
    """Collect all function and class definitions."""

    def __init__(self):
        self.definitions = []  # (name, type, lineno)

    def _record(self, node, def_type):
        """Store one definition, then descend to find nested definitions."""
        self.definitions.append((node.name, def_type, node.lineno))
        self.generic_visit(node)

    def visit_FunctionDef(self, node):
        self._record(node, "function")

    def visit_AsyncFunctionDef(self, node):
        self._record(node, "async_function")

    def visit_ClassDef(self, node):
        self._record(node, "class")


class NameUsageCollector(ast.NodeVisitor):
    """Collect all name references (calls, imports, attribute access)."""

    def __init__(self):
        self.names = set()    # bare names and loaded attribute names
        self.calls = set()    # names appearing in call position
        self.imports = set()  # imported names, original and aliased

    def visit_Name(self, node):
        self.names.add(node.id)
        self.generic_visit(node)

    def visit_Attribute(self, node):
        # Record the attribute itself so `mod.Base` or `obj.callback` counts
        # as a use of `Base` / `callback`. Only loads count: `self.x = 1`
        # is not a reference to a definition named `x`.
        if isinstance(node.ctx, ast.Load):
            self.names.add(node.attr)
        # generic_visit reaches node.value, so the base name is recorded
        # by visit_Name.
        self.generic_visit(node)

    def visit_Call(self, node):
        func = node.func
        if isinstance(func, ast.Name):
            self.calls.add(func.id)
        elif isinstance(func, ast.Attribute):
            self.calls.add(func.attr)
        self.generic_visit(node)

    def _record_imports(self, node):
        """Record both the imported name and its alias, if any."""
        for alias in node.names:
            # `from m import helper as h` is a use of `helper`, not just `h`.
            self.imports.add(alias.name)
            if alias.asname:
                self.imports.add(alias.asname)
        self.generic_visit(node)

    def visit_Import(self, node):
        self._record_imports(node)

    def visit_ImportFrom(self, node):
        self._record_imports(node)


def _parse_file(fpath: Path) -> Optional[ast.AST]:
    """Read and parse *fpath*; return None if it cannot be read or parsed."""
    try:
        source = fpath.read_text(encoding="utf-8", errors="ignore")
        return ast.parse(source, filename=str(fpath))
    except (OSError, SyntaxError, ValueError, RecursionError):
        return None


def _collect_used_names(tree: ast.AST) -> set:
    """Return every name referenced, called, or imported within *tree*."""
    usage = NameUsageCollector()
    usage.visit(tree)
    return usage.names | usage.calls | usage.imports


def analyze_file(filepath: str) -> dict:
    """Analyze a single Python file for dead code.

    Only references inside the same file are considered, so anything used
    exclusively from other modules is reported; use scan_repo for a
    repository-wide view.

    Args:
        filepath: Path of the Python file to analyze.

    Returns:
        ``{"definitions": <count>, "dead": [<entry>, ...]}`` where each entry
        has ``name``, ``type``, ``file`` and ``line`` keys, or
        ``{"error": <message>}`` if the file cannot be read or parsed.
    """
    tree = _parse_file(Path(filepath))
    if tree is None:
        return {"error": f"Could not parse {filepath}"}

    def_collector = DefinitionCollector()
    def_collector.visit(tree)
    definitions = def_collector.definitions

    used_names = _collect_used_names(tree)

    dead = [
        {"name": name, "type": def_type, "file": filepath, "line": lineno}
        for name, def_type, lineno in definitions
        if name not in used_names and not is_safe_unused(name, filepath)
    ]
    return {"definitions": len(definitions), "dead": dead}


def _normalize_excludes(exclude_patterns: Optional[list]) -> list:
    """Return exclude patterns with whitespace and edge slashes stripped."""
    if not exclude_patterns:
        return list(_DEFAULT_EXCLUDES)
    cleaned = (p.strip().replace("\\", "/").strip("/") for p in exclude_patterns)
    return [p for p in cleaned if p]


def _is_excluded(rel_path: Path, patterns: list) -> bool:
    """Return True if *rel_path* (relative to the repo root) is excluded."""
    parts = rel_path.parts
    posix = f"/{rel_path.as_posix()}/"
    for pattern in patterns:
        if pattern in parts:
            return True
        # Multi-component patterns such as "src/generated".
        if "/" in pattern and f"/{pattern}/" in posix:
            return True
    return False


def scan_repo(repo_path: str, exclude_patterns: Optional[list] = None) -> dict:
    """Scan an entire repo for dead code.

    A definition is reported when its name is not referenced, called, or
    imported anywhere in the scanned files and is_safe_unused() does not
    exempt it.

    Args:
        repo_path: Root directory to scan recursively for ``*.py`` files.
        exclude_patterns: Path components (or ``a/b`` sub-paths) to skip,
            matched relative to *repo_path*; trailing slashes are ignored.
            When empty or None, a default list of virtualenv/build/VCS
            directories is used.

    Returns:
        A dict with ``repo``, ``files_scanned``, ``total_definitions``,
        ``dead_code_count`` and ``dead_code`` (entries sorted by file, line).
    """
    root = Path(repo_path)
    excludes = _normalize_excludes(exclude_patterns)

    all_definitions = defaultdict(list)  # name -> [{file, line, type}]
    all_used_names = set()
    all_files = []

    # Single pass: each file is parsed once and feeds both collectors.
    for fpath in sorted(root.rglob("*.py")):
        rel_path = fpath.relative_to(root)
        # Match against the path relative to the repo so that ancestors of
        # the repo (e.g. ~/build/project) cannot exclude everything.
        if fpath.name.startswith(".") or _is_excluded(rel_path, excludes):
            continue

        tree = _parse_file(fpath)
        if tree is None:
            continue
        all_files.append(str(fpath))

        collector = DefinitionCollector()
        collector.visit(tree)
        for name, def_type, lineno in collector.definitions:
            all_definitions[name].append({
                "file": str(rel_path),
                "line": lineno,
                "type": def_type,
            })

        all_used_names |= _collect_used_names(tree)

    dead_code = [
        {"name": name, "type": loc["type"], "file": loc["file"], "line": loc["line"]}
        for name, locations in all_definitions.items()
        if name not in all_used_names
        for loc in locations
        if not is_safe_unused(name, loc["file"])
    ]
    dead_code.sort(key=lambda entry: (entry["file"], entry["line"]))

    return {
        # resolve() so that scanning "." still yields a directory name.
        "repo": root.resolve().name,
        "files_scanned": len(all_files),
        "total_definitions": sum(len(v) for v in all_definitions.values()),
        "dead_code_count": len(dead_code),
        "dead_code": dead_code,
    }


def main() -> int:
    """Command-line entry point; prints the report and returns exit code 0."""
    parser = argparse.ArgumentParser(description="Find dead code in Python codebases")
    parser.add_argument("repo", help="Repository path to scan")
    parser.add_argument("--format", choices=["text", "json"], default="text")
    parser.add_argument("--exclude", help="Comma-separated patterns to exclude")
    parser.add_argument("--git-blame", action="store_true", help="Include git blame info")
    args = parser.parse_args()

    # A missing path would otherwise look like a clean repository.
    if not os.path.isdir(args.repo):
        parser.error(f"not a directory: {args.repo}")

    exclude = args.exclude.split(",") if args.exclude else None
    result = scan_repo(args.repo, exclude)

    # Resolve authors once so both output formats can include them.
    if args.git_blame:
        for item in result["dead_code"]:
            item["author"] = get_git_blame(
                os.path.join(args.repo, item["file"]),
                item["line"],
            )

    if args.format == "json":
        print(json.dumps(result, indent=2))
        return 0

    print(f"Dead Code Report: {result['repo']}")
    print(f"Files scanned: {result['files_scanned']}")
    print(f"Total definitions: {result['total_definitions']}")
    print(f"Dead code found: {result['dead_code_count']}")
    print()

    if not result["dead_code"]:
        print("No dead code detected!")
        return 0

    print(f"{'File':<45} {'Line':>4} {'Type':<10} {'Name'}")
    print("-" * 85)
    for item in result["dead_code"]:
        author = item.get("author")
        suffix = f" ({author})" if author else ""
        print(f"{item['file']:<45} {item['line']:>4} {item['type']:<10} {item['name']}{suffix}")
    return 0


if __name__ == "__main__":
    sys.exit(main())