Template and analyzer for the Codebase Genome batch (issues #667-683). templates/GENOME-template.md: Reusable template with placeholders for repo name, overview, architecture, entry points, data flow, abstractions, API surface, test coverage, security, and design decisions. scripts/genome_analyzer.py: Auto-generates GENOME.md skeleton from a codebase scan: - File counts by extension - Directory structure (depth 2) - Entry point detection (main.py, scripts/, Makefile, etc.) - Test file enumeration - README first-paragraph extraction Usage: python3 scripts/genome_analyzer.py /path/to/repo --output GENOME.md python3 scripts/genome_analyzer.py /path/to/repo --dry-run
172 lines
6.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
genome_analyzer.py — Generate a GENOME.md from a codebase.
|
|
|
|
Scans a repository and produces a structured codebase genome with:
|
|
- File counts by type
|
|
- Architecture overview (directory structure)
|
|
- Entry points
|
|
- Test coverage summary
|
|
|
|
Usage:
|
|
python3 scripts/genome_analyzer.py /path/to/repo
|
|
python3 scripts/genome_analyzer.py /path/to/repo --output GENOME.md
|
|
python3 scripts/genome_analyzer.py /path/to/repo --dry-run
|
|
|
|
Part of #666: GENOME.md Template + Single-Repo Analyzer.
|
|
"""
|
|
|
|
import argparse
|
|
import sys
|
|
from collections import defaultdict
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, List, Tuple
|
|
|
|
# Path components excluded from every scan. Matched against each part of a
# path, so nested occurrences (e.g. pkg/node_modules/...) are skipped too.
# NOTE(review): ".DS_Store" is a file name, not a directory — it is still
# filtered because files are checked component-by-component as well.
SKIP_DIRS = {".git", "__pycache__", ".venv", "venv", "node_modules", ".tox", ".pytest_cache", ".DS_Store"}
|
|
|
|
|
|
def count_files(repo_path: Path) -> Dict[str, int]:
    """Tally files under *repo_path* by extension, most common first.

    Any path containing a SKIP_DIRS component is ignored. Files without
    an extension are grouped under "(no ext)".
    """
    tally: Dict[str, int] = defaultdict(int)
    for path in repo_path.rglob("*"):
        if any(part in SKIP_DIRS for part in path.parts):
            continue
        if path.is_file():
            tally[path.suffix or "(no ext)"] += 1
    # Plain dict ordered by descending count for readable report output.
    return dict(sorted(tally.items(), key=lambda item: -item[1]))
|
|
|
|
|
|
def find_entry_points(repo_path: Path) -> List[str]:
    """Return up to 15 likely entry-point files in *repo_path*.

    Checks a fixed list of conventional top-level names first, then adds
    any non-test .py/.sh files from a scripts/ directory.
    """
    well_known = (
        "main.py", "app.py", "server.py", "cli.py", "manage.py",
        "index.html", "index.js", "index.ts",
        "Makefile", "Dockerfile", "docker-compose.yml",
        "README.md", "deploy.sh", "setup.py", "pyproject.toml",
    )
    found = [name for name in well_known if (repo_path / name).exists()]

    scripts_dir = repo_path / "scripts"
    if scripts_dir.is_dir():
        found.extend(
            f"scripts/{entry.name}"
            for entry in sorted(scripts_dir.iterdir())
            if entry.suffix in (".py", ".sh") and not entry.name.startswith("test_")
        )
    # Cap the list so very large repos don't flood the report.
    return found[:15]
|
|
|
|
|
|
def find_tests(repo_path: Path) -> Tuple[List[str], int]:
    """Collect repo-relative test-file paths and their count.

    A test file is any file (outside SKIP_DIRS) named test_* or ending
    in _test.py / _test.js. Returns (sorted paths, count).
    """
    def _looks_like_test(name: str) -> bool:
        # Conventions used by pytest and common JS test runners.
        return (
            name.startswith("test_")
            or name.endswith("_test.py")
            or name.endswith("_test.js")
        )

    matches = [
        str(candidate.relative_to(repo_path))
        for candidate in repo_path.rglob("*")
        if not any(part in SKIP_DIRS for part in candidate.parts)
        and candidate.is_file()
        and _looks_like_test(candidate.name)
    ]
    matches.sort()
    return matches, len(matches)
|
|
|
|
|
|
def find_directories(repo_path: Path, max_depth: int = 2) -> List[str]:
    """Return up to 30 sorted repo-relative directories at most *max_depth* deep."""
    listing: List[str] = []
    for entry in sorted(repo_path.rglob("*")):
        if not entry.is_dir():
            continue
        rel_path = entry.relative_to(repo_path)
        if len(rel_path.parts) > max_depth:
            continue
        if any(part in SKIP_DIRS for part in entry.parts):
            continue
        rel = str(rel_path)
        # Defensive: rglob never yields the root itself, so "." can't occur.
        if rel != ".":
            listing.append(rel)
    return listing[:30]
|
|
|
|
|
|
def read_readme(repo_path: Path) -> str:
    """Return the first paragraph (up to 5 lines) of the repo's README.

    Tries README.md / .rst / .txt / README in order. Leading heading lines
    ("#...") are skipped; the paragraph ends at the first blank line after
    text has started.

    FIX: previously, the first README that *existed* was used even when it
    yielded no text (empty file, headings only), returning "" instead of
    the fallback. Now empty candidates are skipped and the fallback
    "(no README found)" is returned when no README produces a paragraph.
    """
    for name in ("README.md", "README.rst", "README.txt", "README"):
        readme = repo_path / name
        if not readme.exists():
            continue
        para: List[str] = []
        started = False
        for line in readme.read_text(encoding="utf-8", errors="replace").split("\n"):
            # Skip title/heading lines that precede the first paragraph.
            if line.startswith("#") and not started:
                continue
            if line.strip():
                started = True
                para.append(line.strip())
            elif started:
                # Blank line ends the first paragraph.
                break
        if para:
            return " ".join(para[:5])
    return "(no README found)"
|
|
|
|
|
|
def generate_genome(repo_path: Path, repo_name: str = "") -> str:
    """Assemble and return the complete GENOME.md markdown for *repo_path*.

    Gathers stats via the scan helpers, then renders sections: overview,
    architecture, file types, entry points, test coverage, and two
    placeholder sections for manual analysis.
    """
    repo_name = repo_name or repo_path.name
    date = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    # Run all scans up front; rendering below is pure string assembly.
    readme_desc = read_readme(repo_path)
    file_counts = count_files(repo_path)
    total_files = sum(file_counts.values())
    entry_points = find_entry_points(repo_path)
    test_files, test_count = find_tests(repo_path)
    dirs = find_directories(repo_path)

    out: List[str] = [
        f"# GENOME.md — {repo_name}", "",
        f"> Codebase analysis generated {date}. {readme_desc[:100]}.", "",
        "## Project Overview", "",
        readme_desc, "",
        f"**{total_files} files** across {len(file_counts)} file types.", "",
        "## Architecture", "",
        "```",
    ]
    out += [f" {d}/" for d in dirs[:20]]
    out.append("```")

    out += ["", "### File Types", "", "| Type | Count |", "|------|-------|"]
    out += [f"| {ext} | {count} |" for ext, count in list(file_counts.items())[:15]]

    out += ["", "## Entry Points", ""]
    out += [f"- `{ep}`" for ep in entry_points]

    out += ["", "## Test Coverage", "", f"**{test_count} test files** found.", ""]
    if test_files:
        out += [f"- `{tf}`" for tf in test_files[:10]]
        if len(test_files) > 10:
            out.append(f"- ... and {len(test_files) - 10} more")
    else:
        out.append("No test files found.")

    # Placeholder sections for the manual portion of the genome.
    out += ["", "## Security Considerations", "", "(To be filled during analysis)", ""]
    out += ["## Design Decisions", "", "(To be filled during analysis)", ""]
    return "\n".join(out)
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, validate the repo, emit the genome.

    Modes:
    - --dry-run: print summary stats only and exit 0.
    - --output FILE: write the generated GENOME.md to FILE.
    - default: print the genome to stdout.

    Exits 1 with a message on stderr if repo_path is not a directory.
    """
    parser = argparse.ArgumentParser(description="Generate GENOME.md from a codebase")
    parser.add_argument("repo_path", help="Path to repository")
    parser.add_argument("--output", default="", help="Output file (default: stdout)")
    parser.add_argument("--name", default="", help="Repository name")
    parser.add_argument("--dry-run", action="store_true", help="Print stats only")
    args = parser.parse_args()

    repo_path = Path(args.repo_path).resolve()
    if not repo_path.is_dir():
        print(f"ERROR: {repo_path} is not a directory", file=sys.stderr)
        sys.exit(1)
    repo_name = args.name or repo_path.name

    if args.dry_run:
        counts = count_files(repo_path)
        _, test_count = find_tests(repo_path)
        print(f"Repo: {repo_name}")
        print(f"Total files: {sum(counts.values())}")
        print(f"Test files: {test_count}")
        print(f"Top types: {', '.join(f'{k}={v}' for k,v in list(counts.items())[:5])}")
        sys.exit(0)

    genome = generate_genome(repo_path, repo_name)
    if args.output:
        # FIX: write with explicit UTF-8. The README is read as UTF-8, so the
        # genome may contain non-ASCII text; relying on the locale's default
        # encoding here could raise UnicodeEncodeError on some systems.
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(genome)
        print(f"Written: {args.output}")
    else:
        print(genome)
|
|
|
|
|
|
# Standard script guard: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|