Compare commits
1 Commits
fix/670
...
fix/666-ge
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b8b8bb65fd |
171
scripts/genome_analyzer.py
Executable file
171
scripts/genome_analyzer.py
Executable file
@@ -0,0 +1,171 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
genome_analyzer.py — Generate a GENOME.md from a codebase.
|
||||
|
||||
Scans a repository and produces a structured codebase genome with:
|
||||
- File counts by type
|
||||
- Architecture overview (directory structure)
|
||||
- Entry points
|
||||
- Test coverage summary
|
||||
|
||||
Usage:
|
||||
python3 scripts/genome_analyzer.py /path/to/repo
|
||||
python3 scripts/genome_analyzer.py /path/to/repo --output GENOME.md
|
||||
python3 scripts/genome_analyzer.py /path/to/repo --dry-run
|
||||
|
||||
Part of #666: GENOME.md Template + Single-Repo Analyzer.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
SKIP_DIRS = {".git", "__pycache__", ".venv", "venv", "node_modules", ".tox", ".pytest_cache", ".DS_Store"}
|
||||
|
||||
|
||||
def count_files(repo_path: Path) -> Dict[str, int]:
|
||||
counts = defaultdict(int)
|
||||
for f in repo_path.rglob("*"):
|
||||
if any(part in SKIP_DIRS for part in f.parts):
|
||||
continue
|
||||
if f.is_file():
|
||||
ext = f.suffix or "(no ext)"
|
||||
counts[ext] += 1
|
||||
return dict(sorted(counts.items(), key=lambda x: -x[1]))
|
||||
|
||||
|
||||
def find_entry_points(repo_path: Path) -> List[str]:
|
||||
entry_points = []
|
||||
candidates = [
|
||||
"main.py", "app.py", "server.py", "cli.py", "manage.py",
|
||||
"index.html", "index.js", "index.ts",
|
||||
"Makefile", "Dockerfile", "docker-compose.yml",
|
||||
"README.md", "deploy.sh", "setup.py", "pyproject.toml",
|
||||
]
|
||||
for name in candidates:
|
||||
if (repo_path / name).exists():
|
||||
entry_points.append(name)
|
||||
scripts_dir = repo_path / "scripts"
|
||||
if scripts_dir.is_dir():
|
||||
for f in sorted(scripts_dir.iterdir()):
|
||||
if f.suffix in (".py", ".sh") and not f.name.startswith("test_"):
|
||||
entry_points.append(f"scripts/{f.name}")
|
||||
return entry_points[:15]
|
||||
|
||||
|
||||
def find_tests(repo_path: Path) -> Tuple[List[str], int]:
|
||||
test_files = []
|
||||
for f in repo_path.rglob("*"):
|
||||
if any(part in SKIP_DIRS for part in f.parts):
|
||||
continue
|
||||
if f.is_file() and (f.name.startswith("test_") or f.name.endswith("_test.py") or f.name.endswith("_test.js")):
|
||||
test_files.append(str(f.relative_to(repo_path)))
|
||||
return sorted(test_files), len(test_files)
|
||||
|
||||
|
||||
def find_directories(repo_path: Path, max_depth: int = 2) -> List[str]:
|
||||
dirs = []
|
||||
for d in sorted(repo_path.rglob("*")):
|
||||
if d.is_dir() and len(d.relative_to(repo_path).parts) <= max_depth:
|
||||
if not any(part in SKIP_DIRS for part in d.parts):
|
||||
rel = str(d.relative_to(repo_path))
|
||||
if rel != ".":
|
||||
dirs.append(rel)
|
||||
return dirs[:30]
|
||||
|
||||
|
||||
def read_readme(repo_path: Path) -> str:
|
||||
for name in ["README.md", "README.rst", "README.txt", "README"]:
|
||||
readme = repo_path / name
|
||||
if readme.exists():
|
||||
lines = readme.read_text(encoding="utf-8", errors="replace").split("\n")
|
||||
para = []
|
||||
started = False
|
||||
for line in lines:
|
||||
if line.startswith("#") and not started:
|
||||
continue
|
||||
if line.strip():
|
||||
started = True
|
||||
para.append(line.strip())
|
||||
elif started:
|
||||
break
|
||||
return " ".join(para[:5])
|
||||
return "(no README found)"
|
||||
|
||||
|
||||
def generate_genome(repo_path: Path, repo_name: str = "") -> str:
|
||||
if not repo_name:
|
||||
repo_name = repo_path.name
|
||||
date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
|
||||
readme_desc = read_readme(repo_path)
|
||||
file_counts = count_files(repo_path)
|
||||
total_files = sum(file_counts.values())
|
||||
entry_points = find_entry_points(repo_path)
|
||||
test_files, test_count = find_tests(repo_path)
|
||||
dirs = find_directories(repo_path)
|
||||
|
||||
lines = [
|
||||
f"# GENOME.md — {repo_name}", "",
|
||||
f"> Codebase analysis generated {date}. {readme_desc[:100]}.", "",
|
||||
"## Project Overview", "",
|
||||
readme_desc, "",
|
||||
f"**{total_files} files** across {len(file_counts)} file types.", "",
|
||||
"## Architecture", "",
|
||||
"```",
|
||||
]
|
||||
for d in dirs[:20]:
|
||||
lines.append(f" {d}/")
|
||||
lines.append("```")
|
||||
lines += ["", "### File Types", "", "| Type | Count |", "|------|-------|"]
|
||||
for ext, count in list(file_counts.items())[:15]:
|
||||
lines.append(f"| {ext} | {count} |")
|
||||
lines += ["", "## Entry Points", ""]
|
||||
for ep in entry_points:
|
||||
lines.append(f"- `{ep}`")
|
||||
lines += ["", "## Test Coverage", "", f"**{test_count} test files** found.", ""]
|
||||
if test_files:
|
||||
for tf in test_files[:10]:
|
||||
lines.append(f"- `{tf}`")
|
||||
if len(test_files) > 10:
|
||||
lines.append(f"- ... and {len(test_files) - 10} more")
|
||||
else:
|
||||
lines.append("No test files found.")
|
||||
lines += ["", "## Security Considerations", "", "(To be filled during analysis)", ""]
|
||||
lines += ["## Design Decisions", "", "(To be filled during analysis)", ""]
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Generate GENOME.md from a codebase")
|
||||
parser.add_argument("repo_path", help="Path to repository")
|
||||
parser.add_argument("--output", default="", help="Output file (default: stdout)")
|
||||
parser.add_argument("--name", default="", help="Repository name")
|
||||
parser.add_argument("--dry-run", action="store_true", help="Print stats only")
|
||||
args = parser.parse_args()
|
||||
repo_path = Path(args.repo_path).resolve()
|
||||
if not repo_path.is_dir():
|
||||
print(f"ERROR: {repo_path} is not a directory", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
repo_name = args.name or repo_path.name
|
||||
if args.dry_run:
|
||||
counts = count_files(repo_path)
|
||||
_, test_count = find_tests(repo_path)
|
||||
print(f"Repo: {repo_name}")
|
||||
print(f"Total files: {sum(counts.values())}")
|
||||
print(f"Test files: {test_count}")
|
||||
print(f"Top types: {', '.join(f'{k}={v}' for k,v in list(counts.items())[:5])}")
|
||||
sys.exit(0)
|
||||
genome = generate_genome(repo_path, repo_name)
|
||||
if args.output:
|
||||
with open(args.output, "w") as f:
|
||||
f.write(genome)
|
||||
print(f"Written: {args.output}")
|
||||
else:
|
||||
print(genome)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
46
templates/GENOME-template.md
Normal file
46
templates/GENOME-template.md
Normal file
@@ -0,0 +1,46 @@
|
||||
# GENOME.md — {{REPO_NAME}}
|
||||
|
||||
> Codebase analysis generated {{DATE}}. {{SHORT_DESCRIPTION}}.
|
||||
|
||||
## Project Overview
|
||||
|
||||
{{OVERVIEW}}
|
||||
|
||||
## Architecture
|
||||
|
||||
{{ARCHITECTURE_DIAGRAM}}
|
||||
|
||||
## Entry Points
|
||||
|
||||
{{ENTRY_POINTS}}
|
||||
|
||||
## Data Flow
|
||||
|
||||
{{DATA_FLOW}}
|
||||
|
||||
## Key Abstractions
|
||||
|
||||
{{ABSTRACTIONS}}
|
||||
|
||||
## API Surface
|
||||
|
||||
{{API_SURFACE}}
|
||||
|
||||
## Test Coverage
|
||||
|
||||
### Existing Tests
|
||||
{{EXISTING_TESTS}}
|
||||
|
||||
### Coverage Gaps
|
||||
{{COVERAGE_GAPS}}
|
||||
|
||||
### Critical paths that need tests:
|
||||
{{CRITICAL_PATHS}}
|
||||
|
||||
## Security Considerations
|
||||
|
||||
{{SECURITY}}
|
||||
|
||||
## Design Decisions
|
||||
|
||||
{{DESIGN_DECISIONS}}
|
||||
Reference in New Issue
Block a user