Template and analyzer for the Codebase Genome batch (issues #667-683). templates/GENOME-template.md: Reusable template with placeholders for repo name, overview, architecture, entry points, data flow, abstractions, API surface, test coverage, security, and design decisions. scripts/genome_analyzer.py: Auto-generates GENOME.md skeleton from a codebase scan: - File counts by extension - Directory structure (depth 2) - Entry point detection (main.py, scripts/, Makefile, etc.) - Test file enumeration - README first-paragraph extraction Usage: python3 scripts/genome_analyzer.py /path/to/repo --output GENOME.md python3 scripts/genome_analyzer.py /path/to/repo --dry-run
172 lines
6.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
genome_analyzer.py — Generate a GENOME.md from a codebase.
|
|
|
|
Scans a repository and produces a structured codebase genome with:
|
|
- File counts by type
|
|
- Architecture overview (directory structure)
|
|
- Entry points
|
|
- Test coverage summary
|
|
|
|
Usage:
|
|
python3 scripts/genome_analyzer.py /path/to/repo
|
|
python3 scripts/genome_analyzer.py /path/to/repo --output GENOME.md
|
|
python3 scripts/genome_analyzer.py /path/to/repo --dry-run
|
|
|
|
Part of #666: GENOME.md Template + Single-Repo Analyzer.
|
|
"""
|
|
|
|
import argparse
|
|
import sys
|
|
from collections import defaultdict
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, List, Tuple
|
|
|
|
# Path components excluded from every scan. Matched against each part of a
# path, so nested occurrences (e.g. pkg/node_modules/...) are skipped too.
# NOTE(review): ".DS_Store" is a file name, not a directory — it is still
# filtered because files are checked component-by-component as well.
SKIP_DIRS = {".git", "__pycache__", ".venv", "venv", "node_modules", ".tox", ".pytest_cache", ".DS_Store"}
|
|
|
|
|
|
def count_files(repo_path: Path) -> Dict[str, int]:
    """Tally files under *repo_path* by extension, most common first.

    Any path containing a SKIP_DIRS component is ignored. Files without
    an extension are grouped under "(no ext)".
    """
    tally: Dict[str, int] = defaultdict(int)
    for path in repo_path.rglob("*"):
        if any(part in SKIP_DIRS for part in path.parts):
            continue
        if path.is_file():
            tally[path.suffix or "(no ext)"] += 1
    # Plain dict ordered by descending count for readable report output.
    return dict(sorted(tally.items(), key=lambda item: -item[1]))
|
|
|
|
|
|
def find_entry_points(repo_path: Path) -> List[str]:
    """Return up to 15 likely entry-point files in *repo_path*.

    Checks a fixed list of conventional top-level names first, then adds
    any non-test .py/.sh files from a scripts/ directory.
    """
    well_known = (
        "main.py", "app.py", "server.py", "cli.py", "manage.py",
        "index.html", "index.js", "index.ts",
        "Makefile", "Dockerfile", "docker-compose.yml",
        "README.md", "deploy.sh", "setup.py", "pyproject.toml",
    )
    found = [name for name in well_known if (repo_path / name).exists()]

    scripts_dir = repo_path / "scripts"
    if scripts_dir.is_dir():
        found.extend(
            f"scripts/{entry.name}"
            for entry in sorted(scripts_dir.iterdir())
            if entry.suffix in (".py", ".sh") and not entry.name.startswith("test_")
        )
    # Cap the list so very large repos don't flood the report.
    return found[:15]
|
|
|
|
|
|
def find_tests(repo_path: Path) -> Tuple[List[str], int]:
    """Collect repo-relative test-file paths and their count.

    A test file is any file (outside SKIP_DIRS) named test_* or ending
    in _test.py / _test.js. Returns (sorted paths, count).
    """
    def _looks_like_test(name: str) -> bool:
        # Conventions used by pytest and common JS test runners.
        return (
            name.startswith("test_")
            or name.endswith("_test.py")
            or name.endswith("_test.js")
        )

    matches = [
        str(candidate.relative_to(repo_path))
        for candidate in repo_path.rglob("*")
        if not any(part in SKIP_DIRS for part in candidate.parts)
        and candidate.is_file()
        and _looks_like_test(candidate.name)
    ]
    matches.sort()
    return matches, len(matches)
|
|
|
|
|
|
def find_directories(repo_path: Path, max_depth: int = 2) -> List[str]:
    """Return up to 30 sorted repo-relative directories at most *max_depth* deep."""
    listing: List[str] = []
    for entry in sorted(repo_path.rglob("*")):
        if not entry.is_dir():
            continue
        rel_path = entry.relative_to(repo_path)
        if len(rel_path.parts) > max_depth:
            continue
        if any(part in SKIP_DIRS for part in entry.parts):
            continue
        rel = str(rel_path)
        # Defensive: rglob never yields the root itself, so "." can't occur.
        if rel != ".":
            listing.append(rel)
    return listing[:30]
|
|
|
|
|
|
def read_readme(repo_path: Path) -> str:
    """Return the first paragraph (up to 5 lines) of the repo's README.

    Tries README.md / .rst / .txt / README in order. Leading heading lines
    ("#...") are skipped; the paragraph ends at the first blank line after
    text has started.

    FIX: previously, the first README that *existed* was used even when it
    yielded no text (empty file, headings only), returning "" instead of
    the fallback. Now empty candidates are skipped and the fallback
    "(no README found)" is returned when no README produces a paragraph.
    """
    for name in ("README.md", "README.rst", "README.txt", "README"):
        readme = repo_path / name
        if not readme.exists():
            continue
        para: List[str] = []
        started = False
        for line in readme.read_text(encoding="utf-8", errors="replace").split("\n"):
            # Skip title/heading lines that precede the first paragraph.
            if line.startswith("#") and not started:
                continue
            if line.strip():
                started = True
                para.append(line.strip())
            elif started:
                # Blank line ends the first paragraph.
                break
        if para:
            return " ".join(para[:5])
    return "(no README found)"
|
|
|
|
|
|
def generate_genome(repo_path: Path, repo_name: str = "") -> str:
    """Assemble and return the complete GENOME.md markdown for *repo_path*.

    Gathers stats via the scan helpers, then renders sections: overview,
    architecture, file types, entry points, test coverage, and two
    placeholder sections for manual analysis.
    """
    repo_name = repo_name or repo_path.name
    date = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    # Run all scans up front; rendering below is pure string assembly.
    readme_desc = read_readme(repo_path)
    file_counts = count_files(repo_path)
    total_files = sum(file_counts.values())
    entry_points = find_entry_points(repo_path)
    test_files, test_count = find_tests(repo_path)
    dirs = find_directories(repo_path)

    out: List[str] = [
        f"# GENOME.md — {repo_name}", "",
        f"> Codebase analysis generated {date}. {readme_desc[:100]}.", "",
        "## Project Overview", "",
        readme_desc, "",
        f"**{total_files} files** across {len(file_counts)} file types.", "",
        "## Architecture", "",
        "```",
    ]
    out += [f" {d}/" for d in dirs[:20]]
    out.append("```")

    out += ["", "### File Types", "", "| Type | Count |", "|------|-------|"]
    out += [f"| {ext} | {count} |" for ext, count in list(file_counts.items())[:15]]

    out += ["", "## Entry Points", ""]
    out += [f"- `{ep}`" for ep in entry_points]

    out += ["", "## Test Coverage", "", f"**{test_count} test files** found.", ""]
    if test_files:
        out += [f"- `{tf}`" for tf in test_files[:10]]
        if len(test_files) > 10:
            out.append(f"- ... and {len(test_files) - 10} more")
    else:
        out.append("No test files found.")

    # Placeholder sections for the manual portion of the genome.
    out += ["", "## Security Considerations", "", "(To be filled during analysis)", ""]
    out += ["## Design Decisions", "", "(To be filled during analysis)", ""]
    return "\n".join(out)
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, validate the repo, emit the genome.

    Modes:
    - --dry-run: print summary stats only and exit 0.
    - --output FILE: write the generated GENOME.md to FILE.
    - default: print the genome to stdout.

    Exits 1 with a message on stderr if repo_path is not a directory.
    """
    parser = argparse.ArgumentParser(description="Generate GENOME.md from a codebase")
    parser.add_argument("repo_path", help="Path to repository")
    parser.add_argument("--output", default="", help="Output file (default: stdout)")
    parser.add_argument("--name", default="", help="Repository name")
    parser.add_argument("--dry-run", action="store_true", help="Print stats only")
    args = parser.parse_args()

    repo_path = Path(args.repo_path).resolve()
    if not repo_path.is_dir():
        print(f"ERROR: {repo_path} is not a directory", file=sys.stderr)
        sys.exit(1)
    repo_name = args.name or repo_path.name

    if args.dry_run:
        counts = count_files(repo_path)
        _, test_count = find_tests(repo_path)
        print(f"Repo: {repo_name}")
        print(f"Total files: {sum(counts.values())}")
        print(f"Test files: {test_count}")
        print(f"Top types: {', '.join(f'{k}={v}' for k,v in list(counts.items())[:5])}")
        sys.exit(0)

    genome = generate_genome(repo_path, repo_name)
    if args.output:
        # FIX: write with explicit UTF-8. The README is read as UTF-8, so the
        # genome may contain non-ASCII text; relying on the locale's default
        # encoding here could raise UnicodeEncodeError on some systems.
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(genome)
        print(f"Written: {args.output}")
    else:
        print(genome)
|
|
|
|
|
|
# Standard script guard: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|