Compare commits

...

1 Commits

Author SHA1 Message Date
Alexander Whitestone
b8b8bb65fd feat: GENOME.md template + single-repo analyzer (#666)
Some checks are pending
Agent PR Gate / gate (pull_request) Waiting to run
Agent PR Gate / report (pull_request) Blocked by required conditions
Self-Healing Smoke / self-healing-smoke (pull_request) Waiting to run
Smoke Test / smoke (pull_request) Waiting to run
Template and analyzer for the Codebase Genome batch (issues #667-683).

templates/GENOME-template.md:
  Reusable template with placeholders for repo name, overview,
  architecture, entry points, data flow, abstractions, API surface,
  test coverage, security, and design decisions.

scripts/genome_analyzer.py:
  Auto-generates GENOME.md skeleton from a codebase scan:
  - File counts by extension
  - Directory structure (depth 2)
  - Entry point detection (main.py, scripts/, Makefile, etc.)
  - Test file enumeration
  - README first-paragraph extraction

Usage:
  python3 scripts/genome_analyzer.py /path/to/repo --output GENOME.md
  python3 scripts/genome_analyzer.py /path/to/repo --dry-run
2026-04-16 01:19:41 -04:00
2 changed files with 217 additions and 0 deletions

171
scripts/genome_analyzer.py Executable file
View File

@@ -0,0 +1,171 @@
#!/usr/bin/env python3
"""
genome_analyzer.py — Generate a GENOME.md from a codebase.
Scans a repository and produces a structured codebase genome with:
- File counts by type
- Architecture overview (directory structure)
- Entry points
- Test coverage summary
Usage:
python3 scripts/genome_analyzer.py /path/to/repo
python3 scripts/genome_analyzer.py /path/to/repo --output GENOME.md
python3 scripts/genome_analyzer.py /path/to/repo --dry-run
Part of #666: GENOME.md Template + Single-Repo Analyzer.
"""
import argparse
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Tuple
SKIP_DIRS = {".git", "__pycache__", ".venv", "venv", "node_modules", ".tox", ".pytest_cache", ".DS_Store"}
def count_files(repo_path: Path) -> Dict[str, int]:
counts = defaultdict(int)
for f in repo_path.rglob("*"):
if any(part in SKIP_DIRS for part in f.parts):
continue
if f.is_file():
ext = f.suffix or "(no ext)"
counts[ext] += 1
return dict(sorted(counts.items(), key=lambda x: -x[1]))
def find_entry_points(repo_path: Path) -> List[str]:
entry_points = []
candidates = [
"main.py", "app.py", "server.py", "cli.py", "manage.py",
"index.html", "index.js", "index.ts",
"Makefile", "Dockerfile", "docker-compose.yml",
"README.md", "deploy.sh", "setup.py", "pyproject.toml",
]
for name in candidates:
if (repo_path / name).exists():
entry_points.append(name)
scripts_dir = repo_path / "scripts"
if scripts_dir.is_dir():
for f in sorted(scripts_dir.iterdir()):
if f.suffix in (".py", ".sh") and not f.name.startswith("test_"):
entry_points.append(f"scripts/{f.name}")
return entry_points[:15]
def find_tests(repo_path: Path) -> Tuple[List[str], int]:
test_files = []
for f in repo_path.rglob("*"):
if any(part in SKIP_DIRS for part in f.parts):
continue
if f.is_file() and (f.name.startswith("test_") or f.name.endswith("_test.py") or f.name.endswith("_test.js")):
test_files.append(str(f.relative_to(repo_path)))
return sorted(test_files), len(test_files)
def find_directories(repo_path: Path, max_depth: int = 2) -> List[str]:
dirs = []
for d in sorted(repo_path.rglob("*")):
if d.is_dir() and len(d.relative_to(repo_path).parts) <= max_depth:
if not any(part in SKIP_DIRS for part in d.parts):
rel = str(d.relative_to(repo_path))
if rel != ".":
dirs.append(rel)
return dirs[:30]
def read_readme(repo_path: Path) -> str:
for name in ["README.md", "README.rst", "README.txt", "README"]:
readme = repo_path / name
if readme.exists():
lines = readme.read_text(encoding="utf-8", errors="replace").split("\n")
para = []
started = False
for line in lines:
if line.startswith("#") and not started:
continue
if line.strip():
started = True
para.append(line.strip())
elif started:
break
return " ".join(para[:5])
return "(no README found)"
def generate_genome(repo_path: Path, repo_name: str = "") -> str:
if not repo_name:
repo_name = repo_path.name
date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
readme_desc = read_readme(repo_path)
file_counts = count_files(repo_path)
total_files = sum(file_counts.values())
entry_points = find_entry_points(repo_path)
test_files, test_count = find_tests(repo_path)
dirs = find_directories(repo_path)
lines = [
f"# GENOME.md — {repo_name}", "",
f"> Codebase analysis generated {date}. {readme_desc[:100]}.", "",
"## Project Overview", "",
readme_desc, "",
f"**{total_files} files** across {len(file_counts)} file types.", "",
"## Architecture", "",
"```",
]
for d in dirs[:20]:
lines.append(f" {d}/")
lines.append("```")
lines += ["", "### File Types", "", "| Type | Count |", "|------|-------|"]
for ext, count in list(file_counts.items())[:15]:
lines.append(f"| {ext} | {count} |")
lines += ["", "## Entry Points", ""]
for ep in entry_points:
lines.append(f"- `{ep}`")
lines += ["", "## Test Coverage", "", f"**{test_count} test files** found.", ""]
if test_files:
for tf in test_files[:10]:
lines.append(f"- `{tf}`")
if len(test_files) > 10:
lines.append(f"- ... and {len(test_files) - 10} more")
else:
lines.append("No test files found.")
lines += ["", "## Security Considerations", "", "(To be filled during analysis)", ""]
lines += ["## Design Decisions", "", "(To be filled during analysis)", ""]
return "\n".join(lines)
def main():
parser = argparse.ArgumentParser(description="Generate GENOME.md from a codebase")
parser.add_argument("repo_path", help="Path to repository")
parser.add_argument("--output", default="", help="Output file (default: stdout)")
parser.add_argument("--name", default="", help="Repository name")
parser.add_argument("--dry-run", action="store_true", help="Print stats only")
args = parser.parse_args()
repo_path = Path(args.repo_path).resolve()
if not repo_path.is_dir():
print(f"ERROR: {repo_path} is not a directory", file=sys.stderr)
sys.exit(1)
repo_name = args.name or repo_path.name
if args.dry_run:
counts = count_files(repo_path)
_, test_count = find_tests(repo_path)
print(f"Repo: {repo_name}")
print(f"Total files: {sum(counts.values())}")
print(f"Test files: {test_count}")
print(f"Top types: {', '.join(f'{k}={v}' for k,v in list(counts.items())[:5])}")
sys.exit(0)
genome = generate_genome(repo_path, repo_name)
if args.output:
with open(args.output, "w") as f:
f.write(genome)
print(f"Written: {args.output}")
else:
print(genome)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,46 @@
# GENOME.md — {{REPO_NAME}}
> Codebase analysis generated {{DATE}}. {{SHORT_DESCRIPTION}}.
## Project Overview
{{OVERVIEW}}
## Architecture
{{ARCHITECTURE_DIAGRAM}}
## Entry Points
{{ENTRY_POINTS}}
## Data Flow
{{DATA_FLOW}}
## Key Abstractions
{{ABSTRACTIONS}}
## API Surface
{{API_SURFACE}}
## Test Coverage
### Existing Tests
{{EXISTING_TESTS}}
### Coverage Gaps
{{COVERAGE_GAPS}}
### Critical paths that need tests:
{{CRITICAL_PATHS}}
## Security Considerations
{{SECURITY}}
## Design Decisions
{{DESIGN_DECISIONS}}