#!/usr/bin/env python3
"""
genome_analyzer.py — Generate a GENOME.md from a codebase.

Scans a repository and produces a structured codebase genome with:
- File counts by type
- Architecture overview (directory structure)
- Entry points
- Test coverage summary

Usage:
    python3 scripts/genome_analyzer.py /path/to/repo
    python3 scripts/genome_analyzer.py /path/to/repo --output GENOME.md
    python3 scripts/genome_analyzer.py /path/to/repo --dry-run

Part of #666: GENOME.md Template + Single-Repo Analyzer.
"""

import argparse
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Tuple

# Names excluded from every scan.  Matched against *every* path component,
# so nested occurrences (e.g. pkg/node_modules) are skipped as well.
# .DS_Store is a file, not a directory, but the component check covers it too.
SKIP_DIRS = {".git", "__pycache__", ".venv", "venv", "node_modules",
             ".tox", ".pytest_cache", ".DS_Store"}


def _is_skipped(path: Path) -> bool:
    """Return True if any component of *path* is in SKIP_DIRS."""
    return any(part in SKIP_DIRS for part in path.parts)


def count_files(repo_path: Path) -> Dict[str, int]:
    """Count files under *repo_path*, grouped by extension.

    Files without an extension are grouped under "(no ext)".

    Returns:
        Dict mapping extension -> count, ordered by descending count.
        Ties are broken alphabetically by extension so the ordering is
        deterministic (the original -count key depended on filesystem
        enumeration order).
    """
    counts: Dict[str, int] = defaultdict(int)
    for f in repo_path.rglob("*"):
        if _is_skipped(f):
            continue
        if f.is_file():
            counts[f.suffix or "(no ext)"] += 1
    # FIX: deterministic tie-break on extension name.
    return dict(sorted(counts.items(), key=lambda kv: (-kv[1], kv[0])))


def find_entry_points(repo_path: Path) -> List[str]:
    """Return up to 15 likely entry-point files, repo-relative.

    Checks a fixed list of conventional top-level names, then adds any
    non-test .py/.sh files from a scripts/ directory.
    """
    candidates = [
        "main.py", "app.py", "server.py", "cli.py", "manage.py",
        "index.html", "index.js", "index.ts",
        "Makefile", "Dockerfile", "docker-compose.yml",
        "README.md", "deploy.sh", "setup.py", "pyproject.toml",
    ]
    entry_points = [name for name in candidates if (repo_path / name).exists()]

    scripts_dir = repo_path / "scripts"
    if scripts_dir.is_dir():
        for f in sorted(scripts_dir.iterdir()):
            if f.suffix in (".py", ".sh") and not f.name.startswith("test_"):
                entry_points.append(f"scripts/{f.name}")
    return entry_points[:15]


def find_tests(repo_path: Path) -> Tuple[List[str], int]:
    """Find test files (test_*, *_test.py, *_test.js) under *repo_path*.

    Returns:
        (sorted repo-relative paths, count).
    """
    test_files: List[str] = []
    for f in repo_path.rglob("*"):
        if _is_skipped(f):
            continue
        if f.is_file() and (f.name.startswith("test_")
                            or f.name.endswith("_test.py")
                            or f.name.endswith("_test.js")):
            test_files.append(str(f.relative_to(repo_path)))
    return sorted(test_files), len(test_files)


def find_directories(repo_path: Path, max_depth: int = 2) -> List[str]:
    """Return up to 30 repo-relative directory paths, at most *max_depth* deep."""
    dirs: List[str] = []
    for d in sorted(repo_path.rglob("*")):
        if not d.is_dir() or _is_skipped(d):
            continue
        rel = d.relative_to(repo_path)
        if len(rel.parts) <= max_depth:
            dirs.append(str(rel))
    return dirs[:30]


def read_readme(repo_path: Path) -> str:
    """Return the first paragraph (up to 5 lines) of the repo's README.

    Skips leading Markdown headings; joins the first run of non-blank
    lines with spaces.  Returns "(no README found)" when no README file
    exists — or when one exists but contains no paragraph text (FIX:
    previously returned "" in that case).
    """
    for name in ["README.md", "README.rst", "README.txt", "README"]:
        readme = repo_path / name
        if not readme.exists():
            continue
        lines = readme.read_text(encoding="utf-8", errors="replace").split("\n")
        para: List[str] = []
        started = False
        for line in lines:
            if line.startswith("#") and not started:
                continue  # skip heading(s) before the first paragraph
            if line.strip():
                started = True
                para.append(line.strip())
            elif started:
                break  # blank line ends the first paragraph
        return " ".join(para[:5]) or "(no README found)"
    return "(no README found)"


def generate_genome(repo_path: Path, repo_name: str = "") -> str:
    """Render the full GENOME.md document for *repo_path*.

    Args:
        repo_path: Repository root to analyze.
        repo_name: Display name; defaults to the directory name.

    Returns:
        Markdown text (no trailing newline).
    """
    if not repo_name:
        repo_name = repo_path.name
    date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    readme_desc = read_readme(repo_path)
    file_counts = count_files(repo_path)
    total_files = sum(file_counts.values())
    entry_points = find_entry_points(repo_path)
    test_files, test_count = find_tests(repo_path)
    dirs = find_directories(repo_path)

    # FIX: strip a trailing period from the truncated summary so the
    # blockquote doesn't end in "..".
    summary = readme_desc[:100].rstrip(".")

    lines = [
        f"# GENOME.md — {repo_name}",
        "",
        f"> Codebase analysis generated {date}. {summary}.",
        "",
        "## Project Overview",
        "",
        readme_desc,
        "",
        f"**{total_files} files** across {len(file_counts)} file types.",
        "",
        "## Architecture",
        "",
        "```",
    ]
    for d in dirs[:20]:
        lines.append(f"  {d}/")
    lines.append("```")

    lines += ["", "### File Types", "", "| Type | Count |", "|------|-------|"]
    for ext, count in list(file_counts.items())[:15]:
        lines.append(f"| {ext} | {count} |")

    lines += ["", "## Entry Points", ""]
    for ep in entry_points:
        lines.append(f"- `{ep}`")

    lines += ["", "## Test Coverage", "", f"**{test_count} test files** found.", ""]
    if test_files:
        for tf in test_files[:10]:
            lines.append(f"- `{tf}`")
        if len(test_files) > 10:
            lines.append(f"- ... and {len(test_files) - 10} more")
    else:
        lines.append("No test files found.")

    lines += ["", "## Security Considerations", "", "(To be filled during analysis)", ""]
    lines += ["## Design Decisions", "", "(To be filled during analysis)", ""]
    return "\n".join(lines)


def main():
    """CLI entry point: parse args, validate the repo path, emit the genome."""
    parser = argparse.ArgumentParser(description="Generate GENOME.md from a codebase")
    parser.add_argument("repo_path", help="Path to repository")
    parser.add_argument("--output", default="", help="Output file (default: stdout)")
    parser.add_argument("--name", default="", help="Repository name")
    parser.add_argument("--dry-run", action="store_true", help="Print stats only")
    args = parser.parse_args()

    repo_path = Path(args.repo_path).resolve()
    if not repo_path.is_dir():
        print(f"ERROR: {repo_path} is not a directory", file=sys.stderr)
        sys.exit(1)

    repo_name = args.name or repo_path.name

    if args.dry_run:
        counts = count_files(repo_path)
        _, test_count = find_tests(repo_path)
        print(f"Repo: {repo_name}")
        print(f"Total files: {sum(counts.values())}")
        print(f"Test files: {test_count}")
        print(f"Top types: {', '.join(f'{k}={v}' for k, v in list(counts.items())[:5])}")
        sys.exit(0)

    genome = generate_genome(repo_path, repo_name)
    if args.output:
        # FIX: force UTF-8 — the locale default codec can raise
        # UnicodeEncodeError on Windows for non-ASCII README text.
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(genome)
        print(f"Written: {args.output}")
    else:
        print(genome)


if __name__ == "__main__":
    main()