Compare commits

..

1 Commits

Author SHA1 Message Date
77a753f6f2 feat: dead code detector for Python codebases (#94) 2026-04-15 03:46:43 +00:00
2 changed files with 282 additions and 221 deletions

View File

@@ -0,0 +1,282 @@
#!/usr/bin/env python3
"""
Dead Code Detector for Python Codebases
AST-based analysis to find defined but never-called functions and classes.
Excludes entry points, plugin hooks, __init__ exports.
Usage:
python3 scripts/dead_code_detector.py /path/to/repo/
python3 scripts/dead_code_detector.py hermes-agent/ --format json
python3 scripts/dead_code_detector.py . --exclude tests/,venv/
Output: file:line, function/class name, last git author (if available)
"""
import argparse
import ast
import json
import os
import subprocess
import sys
from collections import defaultdict
from pathlib import Path
from typing import Optional
# Names that are expected to be unused (entry points, protocol methods, etc.)
SAFE_UNUSED_PATTERNS = {
# Python dunders
"__init__", "__str__", "__repr__", "__eq__", "__hash__", "__len__",
"__getitem__", "__setitem__", "__contains__", "__iter__", "__next__",
"__enter__", "__exit__", "__call__", "__bool__", "__del__",
"__post_init__", "__class_getitem__",
# Common entry points
"main", "app", "handler", "setup", "teardown", "fixture",
# pytest
"conftest", "test_", "pytest_", # prefix patterns
# Protocols / abstract
"abstractmethod", "abc_",
}
def is_safe_unused(name: str, filepath: str) -> bool:
"""Check if an unused name is expected to be unused."""
# Test files are exempt
if "test" in filepath.lower():
return True
# Known patterns
for pattern in SAFE_UNUSED_PATTERNS:
if name.startswith(pattern) or name == pattern:
return True
# __init__.py exports are often unused internally
if filepath.endswith("__init__.py"):
return True
return False
def get_git_blame(filepath: str, lineno: int) -> Optional[str]:
"""Get last author of a line via git blame."""
try:
result = subprocess.run(
["git", "blame", "-L", f"{lineno},{lineno}", "--porcelain", filepath],
capture_output=True, text=True, timeout=5
)
for line in result.stdout.split("\n"):
if line.startswith("author "):
return line[7:]
except:
pass
return None
class DefinitionCollector(ast.NodeVisitor):
"""Collect all function and class definitions."""
def __init__(self):
self.definitions = [] # (name, type, lineno, filepath)
def visit_FunctionDef(self, node):
self.definitions.append((node.name, "function", node.lineno))
self.generic_visit(node)
def visit_AsyncFunctionDef(self, node):
self.definitions.append((node.name, "async_function", node.lineno))
self.generic_visit(node)
def visit_ClassDef(self, node):
self.definitions.append((node.name, "class", node.lineno))
self.generic_visit(node)
class NameUsageCollector(ast.NodeVisitor):
"""Collect all name references (calls, imports, attribute access)."""
def __init__(self):
self.names = set()
self.calls = set()
self.imports = set()
def visit_Name(self, node):
self.names.add(node.id)
self.generic_visit(node)
def visit_Attribute(self, node):
if isinstance(node.value, ast.Name):
self.names.add(node.value.id)
self.generic_visit(node)
def visit_Call(self, node):
if isinstance(node.func, ast.Name):
self.calls.add(node.func.id)
elif isinstance(node.func, ast.Attribute):
if isinstance(node.func.value, ast.Name):
self.names.add(node.func.value.id)
self.calls.add(node.func.attr)
self.generic_visit(node)
def visit_Import(self, node):
for alias in node.names:
self.imports.add(alias.asname or alias.name)
self.generic_visit(node)
def visit_ImportFrom(self, node):
for alias in node.names:
self.imports.add(alias.asname or alias.name)
self.generic_visit(node)
def analyze_file(filepath: str) -> dict:
"""Analyze a single Python file for dead code."""
path = Path(filepath)
try:
content = path.read_text()
tree = ast.parse(content, filename=str(filepath))
except (SyntaxError, UnicodeDecodeError):
return {"error": f"Could not parse {filepath}"}
# Collect definitions
def_collector = DefinitionCollector()
def_collector.visit(tree)
definitions = def_collector.definitions
# Collect usage
usage_collector = NameUsageCollector()
usage_collector.visit(tree)
used_names = usage_collector.names | usage_collector.calls | usage_collector.imports
# Also scan the entire repo for references to this file's definitions
# (this is done at the repo level, not file level)
dead = []
for name, def_type, lineno in definitions:
if name.startswith("_") and not name.startswith("__"):
# Private functions — might be used externally, less likely dead
pass
if name not in used_names:
if not is_safe_unused(name, filepath):
dead.append({
"name": name,
"type": def_type,
"file": filepath,
"line": lineno,
})
return {"definitions": len(definitions), "dead": dead}
def scan_repo(repo_path: str, exclude_patterns: list = None) -> dict:
"""Scan an entire repo for dead code."""
path = Path(repo_path)
exclude = exclude_patterns or ["venv", ".venv", "node_modules", "__pycache__",
".git", "dist", "build", ".tox", "vendor"]
all_definitions = {} # name -> [{file, line, type}]
all_files = []
dead_code = []
# First pass: collect all definitions across repo
for fpath in path.rglob("*.py"):
parts = fpath.parts
if any(ex in parts for ex in exclude):
continue
if fpath.name.startswith("."):
continue
try:
content = fpath.read_text(errors="ignore")
tree = ast.parse(content, filename=str(fpath))
except:
continue
all_files.append(str(fpath))
collector = DefinitionCollector()
collector.visit(tree)
for name, def_type, lineno in collector.definitions:
rel_path = str(fpath.relative_to(path))
if name not in all_definitions:
all_definitions[name] = []
all_definitions[name].append({
"file": rel_path,
"line": lineno,
"type": def_type,
})
# Second pass: check each name for usage across entire repo
all_used_names = set()
for fpath_str in all_files:
try:
content = Path(fpath_str).read_text(errors="ignore")
tree = ast.parse(content)
except:
continue
usage = NameUsageCollector()
usage.visit(tree)
all_used_names.update(usage.names)
all_used_names.update(usage.calls)
all_used_names.update(usage.imports)
# Find dead code
for name, locations in all_definitions.items():
if name not in all_used_names:
for loc in locations:
if not is_safe_unused(name, loc["file"]):
dead_code.append({
"name": name,
"type": loc["type"],
"file": loc["file"],
"line": loc["line"],
})
return {
"repo": path.name,
"files_scanned": len(all_files),
"total_definitions": sum(len(v) for v in all_definitions.values()),
"dead_code_count": len(dead_code),
"dead_code": sorted(dead_code, key=lambda x: (x["file"], x["line"])),
}
def main():
parser = argparse.ArgumentParser(description="Find dead code in Python codebases")
parser.add_argument("repo", help="Repository path to scan")
parser.add_argument("--format", choices=["text", "json"], default="text")
parser.add_argument("--exclude", help="Comma-separated patterns to exclude")
parser.add_argument("--git-blame", action="store_true", help="Include git blame info")
args = parser.parse_args()
exclude = args.exclude.split(",") if args.exclude else None
result = scan_repo(args.repo, exclude)
if args.format == "json":
print(json.dumps(result, indent=2))
else:
print(f"Dead Code Report: {result['repo']}")
print(f"Files scanned: {result['files_scanned']}")
print(f"Total definitions: {result['total_definitions']}")
print(f"Dead code found: {result['dead_code_count']}")
print()
if result["dead_code"]:
print(f"{'File':<45} {'Line':>4} {'Type':<10} {'Name'}")
print("-" * 85)
for item in result["dead_code"]:
author = ""
if args.git_blame:
author = get_git_blame(
os.path.join(args.repo, item["file"]),
item["line"]
) or ""
author = f" ({author})" if author else ""
print(f"{item['file']:<45} {item['line']:>4} {item['type']:<10} {item['name']}{author}")
else:
print("No dead code detected!")
if __name__ == "__main__":
main()

View File

@@ -1,221 +0,0 @@
#!/usr/bin/env python3
"""
Knowledge Store Staleness Detector
Checks knowledge entries against their source files to detect staleness.
An entry is stale when its source file has been modified since extraction.
Usage:
python3 scripts/knowledge_staleness_check.py knowledge/index.json
python3 scripts/knowledge_staleness_check.py --repo /path/to/repo --index knowledge/index.json
python3 scripts/knowledge_staleness_check.py --index knowledge/index.json --fix
Expected index.json format:
{
"version": 1,
"facts": [
{
"fact": "...",
"category": "fact|pitfall|pattern|tool-quirk",
"repo": "repo-name",
"confidence": 0.8,
"source_file": "path/to/file.py",
"source_hash": "sha256:abcdef...",
"extracted_at": "2026-04-13T20:00:00Z"
}
]
}
"""
import argparse
import hashlib
import json
import sys
from pathlib import Path
from typing import Optional
def compute_file_hash(filepath: str) -> Optional[str]:
"""Compute SHA-256 hash of a file. Returns None if file not found."""
path = Path(filepath)
if not path.exists():
return None
content = path.read_bytes()
return hashlib.sha256(content).hexdigest()[:16]
def check_staleness(index_path: str, repo_root: str = None) -> dict:
"""Check all entries in the knowledge index for staleness."""
index = Path(index_path)
if not index.exists():
return {"error": f"Index not found: {index_path}"}
data = json.loads(index.read_text())
facts = data.get("facts", [])
if not facts:
return {
"total": 0,
"stale": 0,
"fresh": 0,
"no_source": 0,
"missing_files": 0,
"stale_entries": [],
}
# Determine repo root
if repo_root:
root = Path(repo_root)
else:
root = index.parent.parent # knowledge/index.json -> repo root
results = {
"total": len(facts),
"stale": 0,
"fresh": 0,
"no_source": 0,
"missing_files": 0,
"stale_entries": [],
}
for i, entry in enumerate(facts):
source_file = entry.get("source_file")
stored_hash = entry.get("source_hash")
if not source_file:
results["no_source"] += 1
continue
if not stored_hash:
# Entry has source file but no hash — consider stale
results["stale"] += 1
results["stale_entries"].append({
"index": i,
"fact": entry.get("fact", "")[:100],
"source_file": source_file,
"reason": "no_hash",
})
continue
# Compute current hash
full_path = root / source_file
current_hash = compute_file_hash(str(full_path))
if current_hash is None:
results["missing_files"] += 1
results["stale_entries"].append({
"index": i,
"fact": entry.get("fact", "")[:100],
"source_file": source_file,
"reason": "file_missing",
})
elif current_hash != stored_hash:
results["stale"] += 1
results["stale_entries"].append({
"index": i,
"fact": entry.get("fact", "")[:100],
"source_file": source_file,
"stored_hash": stored_hash,
"current_hash": current_hash,
"reason": "hash_mismatch",
})
else:
results["fresh"] += 1
return results
def add_hashes_to_index(index_path: str, repo_root: str = None) -> dict:
"""Add source hashes to entries that are missing them."""
index = Path(index_path)
data = json.loads(index.read_text())
facts = data.get("facts", [])
if repo_root:
root = Path(repo_root)
else:
root = index.parent.parent
updated = 0
skipped = 0
for entry in facts:
source_file = entry.get("source_file")
if not source_file or entry.get("source_hash"):
skipped += 1
continue
full_path = root / source_file
file_hash = compute_file_hash(str(full_path))
if file_hash:
entry["source_hash"] = file_hash
updated += 1
if updated > 0:
index.write_text(json.dumps(data, indent=2) + "\n")
return {"updated": updated, "skipped": skipped, "total": len(facts)}
def report_staleness(results: dict) -> str:
"""Format staleness check results as a report."""
lines = []
lines.append("=" * 50)
lines.append("KNOWLEDGE STORE STALENESS REPORT")
lines.append("=" * 50)
lines.append(f"Total entries: {results['total']}")
lines.append(f"Fresh: {results['fresh']}")
lines.append(f"Stale: {results['stale']}")
lines.append(f"No source: {results['no_source']}")
lines.append(f"Missing files: {results['missing_files']}")
lines.append("")
if results["stale_entries"]:
lines.append("STALE ENTRIES:")
lines.append("-" * 50)
for entry in results["stale_entries"]:
lines.append(f" [{entry['reason']}] {entry['source_file']}")
lines.append(f" {entry['fact']}")
if entry.get("stored_hash") and entry.get("current_hash"):
lines.append(f" stored: {entry['stored_hash']}")
lines.append(f" current: {entry['current_hash']}")
lines.append("")
if results["total"] > 0:
staleness_pct = results["stale"] / results["total"] * 100
lines.append(f"Staleness rate: {staleness_pct:.1f}%")
else:
lines.append("No entries to check.")
return "\n".join(lines)
def main():
parser = argparse.ArgumentParser(description="Check knowledge store for stale entries")
parser.add_argument("--index", default="knowledge/index.json", help="Path to index.json")
parser.add_argument("--repo", help="Repository root (default: auto-detect from index path)")
parser.add_argument("--fix", action="store_true", help="Add missing hashes to index")
parser.add_argument("--json", action="store_true", help="Output JSON instead of report")
args = parser.parse_args()
if args.fix:
result = add_hashes_to_index(args.index, args.repo)
if args.json:
print(json.dumps(result, indent=2))
else:
print(f"Updated {result['updated']} entries with source hashes.")
print(f"Skipped {result['skipped']} (already had hashes or no source file).")
else:
results = check_staleness(args.index, args.repo)
if "error" in results:
print(f"Error: {results['error']}", file=sys.stderr)
sys.exit(1)
if args.json:
print(json.dumps(results, indent=2))
else:
print(report_staleness(results))
if __name__ == "__main__":
main()