Compare commits

...

1 Commits

Author SHA1 Message Date
Rockachopa
7bff0d5576 fix(#666): make single-repo genome analyzer use canonical template
Some checks failed
Self-Healing Smoke / self-healing-smoke (pull_request) Failing after 22s
Smoke Test / smoke (pull_request) Failing after 31s
Agent PR Gate / gate (pull_request) Failing after 55s
Agent PR Gate / report (pull_request) Successful in 25s
scripts/genome_analyzer.py now loads templates/GENOME-template.md and
populates all {{PLACEHOLDERS}} with auto-discovered repository data:
- REPO_NAME, DATE, SHORT_DESCRIPTION from README
- OVERVIEW from README with file statistics
- ARCHITECTURE_DIAGRAM from directory tree (mermaid)
- ENTRY_POINTS from candidate files + scripts/ + src/
- DATA_FLOW, ABSTRACTIONS, API_SURFACE, COVERAGE_GAPS,
  CRITICAL_PATHS, SECURITY, DESIGN_DECISIONS populated with
  actionable guidance placeholders for manual analysis

The analyzer remains lightweight (single-repo, no external deps)
and now produces GENOME.md conformant to the canonical template.

Closes #666.
2026-04-29 23:57:35 -04:00

View File

@@ -1,20 +1,12 @@
#!/usr/bin/env python3
"""
genome_analyzer.py — Generate a GENOME.md from a codebase.
genome_analyzer.py — Generate a GENOME.md from a codebase using the canonical template.
Scans a repository and produces a structured codebase genome with:
- File counts by type
- Architecture overview (directory structure)
- Entry points
- Test coverage summary
Scans a repository and fills in templates/GENOME-template.md with discovered
structure, entry points, and test files. Sections that need manual analysis
are filled with guidance placeholders such as "(requires manual review — ...)".
Usage:
python3 scripts/genome_analyzer.py /path/to/repo
python3 scripts/genome_analyzer.py /path/to/repo --output GENOME.md
python3 scripts/genome_analyzer.py /path/to/repo --dry-run
Part of #666: GENOME.md Template + Single-Repo Analyzer.
"""
Part of #666: GENOME.md Template + Single-Repo Analyzer."""
import argparse
import sys
@@ -23,25 +15,32 @@ from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Tuple
SKIP_DIRS = {".git", "__pycache__", ".venv", "venv", "node_modules", ".tox", ".pytest_cache", ".DS_Store"}
SKIP_DIRS = {".git", "__pycache__", ".venv", "venv", "node_modules",
".tox", ".pytest_cache", ".DS_Store", "dist", "build", "coverage"}
def _is_source(p: Path) -> bool:
return p.suffix in {".py", ".js", ".ts", ".mjs", ".cjs", ".jsx",
".tsx", ".sh"} and not p.name.startswith("test_")
def count_files(repo_path: Path) -> Dict[str, int]:
    """Count the files under *repo_path*, grouped by file extension.

    Files are skipped when any path component *below* ``repo_path`` is listed
    in ``SKIP_DIRS``. Only the repository-relative part of the path is
    checked, so a checkout that itself lives under a directory named e.g.
    ``build`` or ``venv`` is still scanned.

    Args:
        repo_path: Root directory to scan recursively.

    Returns:
        Mapping of extension (``"(no ext)"`` for files without a suffix) to
        file count, ordered by descending count and then by extension so the
        result does not depend on filesystem iteration order.
    """
    counts: Dict[str, int] = {}
    for f in repo_path.rglob("*"):
        if not f.is_file():
            continue
        if any(part in SKIP_DIRS for part in f.relative_to(repo_path).parts):
            continue
        ext = f.suffix or "(no ext)"
        counts[ext] = counts.get(ext, 0) + 1
    return dict(sorted(counts.items(), key=lambda item: (-item[1], item[0])))
def find_entry_points(repo_path: Path) -> List[str]:
entry_points = []
entry_points: List[str] = []
candidates = [
"main.py", "app.py", "server.py", "cli.py", "manage.py",
"index.html", "index.js", "index.ts",
"__main__.py", "index.html", "index.js", "index.ts",
"Makefile", "Dockerfile", "docker-compose.yml",
"README.md", "deploy.sh", "setup.py", "pyproject.toml",
]
@@ -53,27 +52,46 @@ def find_entry_points(repo_path: Path) -> List[str]:
for f in sorted(scripts_dir.iterdir()):
if f.suffix in (".py", ".sh") and not f.name.startswith("test_"):
entry_points.append(f"scripts/{f.name}")
return entry_points[:15]
src_dir = repo_path / "src"
if src_dir.is_dir():
for f in sorted(src_dir.iterdir()):
if f.is_file() and f.suffix == ".py" and not f.name.startswith("test_"):
entry_points.append(f"src/{f.name}")
top_py = [f.name for f in repo_path.iterdir()
if f.is_file() and f.suffix == ".py" and _is_source(f)]
entry_points.extend(top_py[:5])
# Deduplicate preserving order
seen: set[str] = set()
result: List[str] = []
for ep in entry_points:
if ep not in seen:
seen.add(ep)
result.append(ep)
return result[:20]
def find_tests(repo_path: Path) -> Tuple[List[str], int]:
    """Find test files under *repo_path* by naming convention.

    A file is treated as a test when its name starts with ``test_`` or ends
    with ``_test.py``, ``_test.js`` or ``.test.js``. Paths with a component
    (relative to ``repo_path``) listed in ``SKIP_DIRS`` are ignored. Each
    matching file is reported exactly once.

    Args:
        repo_path: Root directory to scan recursively.

    Returns:
        A tuple ``(paths, count)`` where ``paths`` is the sorted list of
        repository-relative test file paths and ``count`` is its length.
    """
    # Both JS spellings are accepted: "_test.js" and "*.test.js".
    test_suffixes = ("_test.py", "_test.js", ".test.js")
    test_files: List[str] = []
    for f in repo_path.rglob("*"):
        if not f.is_file():
            continue
        rel = f.relative_to(repo_path)
        # Check only repo-relative components so the location of the checkout
        # itself (e.g. /home/me/build/repo) cannot hide every file.
        if any(part in SKIP_DIRS for part in rel.parts):
            continue
        name = f.name
        if name.startswith("test_") or name.endswith(test_suffixes):
            test_files.append(str(rel))
    test_files.sort()
    return test_files, len(test_files)
def find_directories(repo_path: Path, max_depth: int = 2) -> List[str]:
    """List directories under *repo_path* down to *max_depth* levels.

    Directories with a component (relative to ``repo_path``) listed in
    ``SKIP_DIRS`` are omitted. ``rglob`` yields each directory once, so no
    de-duplication pass is needed.

    Args:
        repo_path: Root directory to scan recursively.
        max_depth: Maximum number of path components below ``repo_path``.

    Returns:
        Up to 30 repository-relative directory paths, in sorted path order.
    """
    dirs: List[str] = []
    for d in sorted(repo_path.rglob("*")):
        if not d.is_dir():
            continue
        rel = d.relative_to(repo_path)
        if len(rel.parts) > max_depth:
            continue
        # Filter on repo-relative parts only; the checkout's own parent
        # directories must not trigger the skip list.
        if any(part in SKIP_DIRS for part in rel.parts):
            continue
        dirs.append(str(rel))
    return dirs[:30]
@@ -81,88 +99,198 @@ def read_readme(repo_path: Path) -> str:
for name in ["README.md", "README.rst", "README.txt", "README"]:
readme = repo_path / name
if readme.exists():
lines = readme.read_text(encoding="utf-8", errors="replace").split("\n")
para = []
started = False
for line in lines:
if line.startswith("#") and not started:
text = readme.read_text(encoding="utf-8", errors="replace")
paras: List[str] = []
for line in text.splitlines():
stripped = line.strip()
if stripped.startswith("#"):
continue
if line.strip():
started = True
para.append(line.strip())
elif started:
if stripped:
paras.append(stripped)
elif paras:
break
return " ".join(para[:5])
return " ".join(paras[:3]) if paras else "(README exists but is mostly empty)"
return "(no README found)"
def generate_genome(repo_path: Path, repo_name: str = "") -> str:
if not repo_name:
repo_name = repo_path.name
date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
readme_desc = read_readme(repo_path)
file_counts = count_files(repo_path)
total_files = sum(file_counts.values())
entry_points = find_entry_points(repo_path)
test_files, test_count = find_tests(repo_path)
dirs = find_directories(repo_path)
lines = [
f"# GENOME.md — {repo_name}", "",
f"> Codebase analysis generated {date}. {readme_desc[:100]}.", "",
"## Project Overview", "",
readme_desc, "",
f"**{total_files} files** across {len(file_counts)} file types.", "",
"## Architecture", "",
"```",
]
for d in dirs[:20]:
lines.append(f" {d}/")
lines.append("```")
lines += ["", "### File Types", "", "| Type | Count |", "|------|-------|"]
for ext, count in list(file_counts.items())[:15]:
lines.append(f"| {ext} | {count} |")
lines += ["", "## Entry Points", ""]
for ep in entry_points:
lines.append(f"- `{ep}`")
lines += ["", "## Test Coverage", "", f"**{test_count} test files** found.", ""]
if test_files:
for tf in test_files[:10]:
lines.append(f"- `{tf}`")
if len(test_files) > 10:
lines.append(f"- ... and {len(test_files) - 10} more")
else:
lines.append("No test files found.")
lines += ["", "## Security Considerations", "", "(To be filled during analysis)", ""]
lines += ["## Design Decisions", "", "(To be filled during analysis)", ""]
def _mermaid_diagram(repo_name: str, dirs: List[str], entry_points: List[str]) -> str:
lines = ["graph TD", f' root["{repo_name} (repo root)"]']
for d in dirs[:15]:
safe = d.replace("/", "_").replace("-", "_")
lines.append(f' root --> {safe}["{d}/"]')
lines.append("")
lines.append(" %% Entry points (leaf nodes)")
for ep in entry_points[:10]:
safe_ep = ep.replace("/", "_").replace(".", "_").replace("-", "_")
parent = ep.split("/")[0] if "/" in ep else "root"
parent_safe = parent.replace("/", "_").replace("-", "_")
lines.append(f' {parent_safe} --> {safe_ep}["{ep}"]')
return "\n".join(lines)
def main():
parser = argparse.ArgumentParser(description="Generate GENOME.md from a codebase")
parser.add_argument("repo_path", help="Path to repository")
parser.add_argument("--output", default="", help="Output file (default: stdout)")
parser.add_argument("--name", default="", help="Repository name")
parser.add_argument("--dry-run", action="store_true", help="Print stats only")
def _bullet_list(items: List[str]) -> str:
if not items:
return "(none discovered)"
return "\n".join(f"- `{item}`" for item in items[:20])
def _comma_list(items: List[str]) -> str:
return ", ".join(f"`{i}`" for i in items[:10])
def _collect_python_symbols(repo_root: Path, max_files: int = 100) -> Tuple[List[str], List[str]]:
    """Return ``(classes, functions)`` discovered in the repo's Python files.

    Classes are collected from anywhere in each module; functions are limited
    to public (no leading underscore) definitions at module top level,
    including ``async def``. Files that cannot be read or parsed are skipped.
    """
    import ast  # local import, matching the original block

    candidates = sorted(
        f for f in repo_root.rglob("*.py")
        if f.is_file()
        # Filter on repo-relative parts so the checkout's own location
        # (e.g. /tmp/build/repo) cannot exclude every file.
        and not any(part in SKIP_DIRS for part in f.relative_to(repo_root).parts)
    )
    classes: List[str] = []
    functions: List[str] = []
    for f in candidates[:max_files]:
        rel = f.relative_to(repo_root)
        try:
            tree = ast.parse(f.read_text(encoding="utf-8", errors="replace"))
        except (OSError, SyntaxError, ValueError, RecursionError):
            # Unreadable file, invalid syntax, NUL bytes (ValueError on some
            # Python versions) or pathologically nested source: best effort.
            continue
        for node in ast.walk(tree):
            if isinstance(node, ast.ClassDef):
                classes.append(f"{rel}::{node.name}")
        for node in tree.body:
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and not node.name.startswith("_"):
                functions.append(f"{rel}::{node.name}")
    return sorted(set(classes))[:15], sorted(set(functions))[:20]


def _load_genome_template() -> str:
    """Return the canonical template text, or a built-in minimal fallback.

    The template is looked up at ``../templates/GENOME-template.md`` relative
    to this script's directory. The fallback uses the same ``{{NAME}}``
    placeholder syntax as the substitution step.
    """
    template_file = Path(__file__).resolve().parent.parent / "templates" / "GENOME-template.md"
    if template_file.is_file():
        return template_file.read_text(encoding="utf-8")
    return (
        "# GENOME.md — {{REPO_NAME}}\n\n"
        "> Codebase analysis generated {{DATE}}. {{SHORT_DESCRIPTION}}.\n\n"
        "## Project Overview\n\n{{OVERVIEW}}\n\n"
        "## Architecture\n\n{{ARCHITECTURE_DIAGRAM}}\n\n"
        "## Entry Points\n\n{{ENTRY_POINTS}}\n\n"
        "## Data Flow\n\n{{DATA_FLOW}}\n\n"
        "## Key Abstractions\n\n{{ABSTRACTIONS}}\n\n"
        "## API Surface\n\n{{API_SURFACE}}\n\n"
        "## Test Coverage\n\n"
        "### Existing Tests\n{{EXISTING_TESTS}}\n\n"
        "### Coverage Gaps\n{{COVERAGE_GAPS}}\n\n"
        "### Critical paths that need tests:\n{{CRITICAL_PATHS}}\n\n"
        "## Security Considerations\n\n{{SECURITY}}\n\n"
        "## Design Decisions\n\n{{DESIGN_DECISIONS}}\n"
    )


def _fill_template(template_text: str, fields: Dict[str, str]) -> str:
    """Replace every ``{{NAME}}`` in *template_text* with ``fields[NAME]``.

    Substitution happens in a single pass, so placeholder-looking text inside
    an inserted value (for example in a README excerpt) is left alone.
    Placeholders without a matching field are kept unchanged.
    """
    import re

    return re.sub(
        r"\{\{([A-Z_]+)\}\}",
        lambda match: fields.get(match.group(1), match.group(0)),
        template_text,
    )


def generate_genome(repo_path: Path, repo_name: str = "") -> str:
    """Build the GENOME.md text for the repository at *repo_path*.

    Discovers the README summary, file counts, directories, entry points,
    test files and Python classes/functions, then substitutes them into the
    GENOME template. Sections that cannot be auto-detected are filled with
    guidance text for manual review.

    Args:
        repo_path: Repository root; resolved to an absolute path first.
        repo_name: Display name. Defaults to the resolved directory name, so
            a relative path such as ``Path(".")`` still yields a name.

    Returns:
        The filled-in template as a string.
    """
    repo_root = repo_path.resolve()
    if not repo_name:
        repo_name = repo_root.name
    date = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    readme_desc = read_readme(repo_root)
    # Mark truncation explicitly so a cut-off sentence is recognisable.
    short_desc = readme_desc[:120] + "…" if len(readme_desc) > 120 else readme_desc
    file_counts = count_files(repo_root)
    total_files = sum(file_counts.values())
    dirs = find_directories(repo_root, max_depth=2)
    entry_points = find_entry_points(repo_root)
    test_files, test_count = find_tests(repo_root)
    classes, functions = _collect_python_symbols(repo_root)

    overview = f"{readme_desc}\n\n- **{total_files}** files across **{len(file_counts)}** types."
    if file_counts:
        top_types = [f"{k}:{v}" for k, v in list(file_counts.items())[:5]]
        overview += f"\n- Primary languages: {_comma_list(top_types)}."

    test_summary = f"**{test_count} test files** discovered.\n\n"
    if test_files:
        test_summary += _bullet_list(test_files[:10])
        if len(test_files) > 10:
            test_summary += f"\n- ... and {len(test_files) - 10} more"
    else:
        test_summary += "(no tests found)"

    abstractions_md = ""
    if classes:
        abstractions_md += "**Key classes** (auto-detected via AST):\n" + _bullet_list(classes[:10]) + "\n\n"
    if functions:
        abstractions_md += "**Key functions** (top-level, public):\n" + _bullet_list(functions[:10])
    if not abstractions_md:
        abstractions_md = "(no Python abstractions auto-detected)"

    security_md = ("Security review required. Key areas to examine:\n"
                   "- Input validation boundaries\n"
                   "- Authentication / authorization checks\n"
                   "- Secrets handling and credential storage\n"
                   "- Network exposure and attack surface\n"
                   "- Data privacy and PII handling")
    design_decisions_md = ("Open architectural questions and elaboration required:\n"
                           "- Why this structure and not another?\n"
                           "- What constraints shaped current abstractions?\n"
                           "- What trade-offs were accepted and why?\n"
                           "- Future migration paths and breaking-change plans")

    fields = {
        "REPO_NAME": repo_name,
        "DATE": date,
        "SHORT_DESCRIPTION": short_desc,
        "OVERVIEW": overview,
        "ARCHITECTURE_DIAGRAM": _mermaid_diagram(repo_name, dirs, entry_points),
        "ENTRY_POINTS": _bullet_list(entry_points),
        "DATA_FLOW": "(requires manual review — describe request flow, data pipelines, or state transitions)",
        "ABSTRACTIONS": abstractions_md,
        "API_SURFACE": "(requires manual review — list public endpoints, CLI commands, HTTP routes, or exposed symbols here)",
        "EXISTING_TESTS": test_summary,
        "COVERAGE_GAPS": "(requires manual review — identify untested modules, critical paths lacking tests)",
        "CRITICAL_PATHS": "(requires manual review — enumerate high-risk or high-value paths needing test coverage)",
        "SECURITY": security_md,
        "DESIGN_DECISIONS": design_decisions_md,
    }
    return _fill_template(_load_genome_template(), fields)
def main() -> None:
    """Command-line entry point: analyse a repository and emit GENOME.md.

    Exits with status 1 when the given path is not a directory. With
    ``--dry-run`` only summary statistics are printed and the process exits
    with status 0. Otherwise the generated document is written once to
    ``--output`` (creating missing parent directories) or printed to stdout.
    """
    parser = argparse.ArgumentParser(description="Generate GENOME.md from a codebase using the canonical template")
    parser.add_argument("repo_path", help="Path to repository root")
    parser.add_argument("--output", "-o", default="", help="Write GENOME.md to this path (default: stdout)")
    parser.add_argument("--name", default="", help="Override repository display name")
    parser.add_argument("--dry-run", action="store_true", help="Print discovered stats without generating file")
    args = parser.parse_args()

    repo_path = Path(args.repo_path).resolve()
    if not repo_path.is_dir():
        print(f"ERROR: {repo_path} is not a directory", file=sys.stderr)
        sys.exit(1)
    repo_name = args.name or repo_path.name

    if args.dry_run:
        counts = count_files(repo_path)
        _, test_count = find_tests(repo_path)
        print(f"Repo: {repo_name}")
        # count_files counts every file, not only text files, so the label
        # carries no "(text)" qualifier and the total is printed once.
        print(f"Total files: {sum(counts.values())}")
        print(f"Test files: {test_count}")
        print(f"Top types: {', '.join(f'{k}={v}' for k, v in list(counts.items())[:5])}")
        sys.exit(0)

    genome = generate_genome(repo_path, repo_name)
    if args.output:
        out = Path(args.output)
        # Allow targets such as docs/GENOME.md in a directory not yet created.
        out.parent.mkdir(parents=True, exist_ok=True)
        out.write_text(genome, encoding="utf-8")
        print(f"GENOME.md written: {out}")
    else:
        print(genome)