feat: big-brain repo audit script (#656 )

Reusable Ollama-driven repo audit tool that: - Generates context bundle (directory tree, file stats, key file contents, git stats) - Sends to local LLM for deep architectural analysis - Produces structured audit report (architecture, quality, security, debt) - Fully local — zero cloud credits Usage: python3 scripts/big_brain_repo_audit.py --repo-root /path/to/repo --repo-name Timmy_Foundation/the-nexus --model gemma4:latest --context-out research/big-brain/context-bundle.md --report-out research/big-brain/audit-report.md Includes 12 tests covering tree generation, key file collection, file stats, context bundle building, and helpers. Resolves #575 Closes #656
2026-04-14 07:45:07 -04:00
2 changed files with 451 additions and 0 deletions
--- a/scripts/big_brain_repo_audit.py
+++ b/scripts/big_brain_repo_audit.py
@@ -0,0 +1,350 @@
+#!/usr/bin/env python3
+"""
+Big Brain Repo Audit — Ollama-driven deep codebase audit.
+
+Generates a context bundle from a repo, sends it to a local LLM for
+analysis, and produces a structured audit report.
+
+Usage:
+    python3 big_brain_repo_audit.py --repo-root /path/to/repo --repo-name Owner/repo
+    python3 big_brain_repo_audit.py --repo-root . --repo-name Timmy_Foundation/the-nexus --model gemma4:latest
+    python3 big_brain_repo_audit.py --context-only  # just generate context bundle, no LLM
+"""
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+import urllib.request
+from datetime import datetime, timezone
+from pathlib import Path
+
+# ── Config ──────────────────────────────────────────────
+OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
+DEFAULT_MODEL = os.environ.get("AUDIT_MODEL", "gemma4:latest")
+DEFAULT_CTX = int(os.environ.get("AUDIT_NUM_CTX", "32768"))
+
+# Files/dirs to skip during tree generation
+SKIP_DIRS = {
+    ".git", "node_modules", "__pycache__", ".pytest_cache", ".mypy_cache",
+    "venv", ".venv", "env", ".env", "dist", "build", ".tox", ".eggs",
+    "vendor", "target", ".next", ".nuxt", "coverage", ".coverage",
+}
+SKIP_EXTENSIONS = {
+    ".pyc", ".pyo", ".so", ".dylib", ".o", ".a", ".class", ".jar",
+    ".bin", ".exe", ".dll", ".wasm", ".lock",
+    ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico", ".webp",
+    ".mp3", ".mp4", ".wav", ".ogg", ".avi", ".mov",
+    ".zip", ".tar", ".gz", ".bz2", ".7z", ".rar",
+    ".db", ".sqlite", ".sqlite3",
+    ".woff", ".woff2", ".ttf", ".eot",
+}
+
+# Key files to include full content
+KEY_FILES = {
+    "README.md", "README.rst", "README.txt",
+    "CONTRIBUTING.md", "ARCHITECTURE.md", "DESIGN.md",
+    "SOUL.md", "AGENTS.md",
+    "Makefile", "Dockerfile", "docker-compose.yml", "docker-compose.yaml",
+    "pyproject.toml", "setup.py", "setup.cfg",
+    "package.json", "Cargo.toml", "go.mod",
+    ".gitea/workflows/ci.yml",
+}
+
+
+# ── Context Bundle Generation ──────────────────────────
+
+def generate_tree(repo_root: str, max_depth: int = 5) -> str:
+    """Generate a directory tree listing."""
+    lines = []
+    repo_path = Path(repo_root).resolve()
+
+    def walk(path: Path, prefix: str, depth: int):
+        if depth > max_depth:
+            return
+        try:
+            entries = sorted(path.iterdir(), key=lambda e: (not e.is_dir(), e.name.lower()))
+        except PermissionError:
+            return
+
+        dirs = [e for e in entries if e.is_dir() and e.name not in SKIP_DIRS]
+        files = [e for e in entries if e.is_file() and not any(e.name.endswith(ext) for ext in SKIP_EXTENSIONS)]
+
+        for i, d in enumerate(dirs):
+            is_last_dir = (i == len(dirs) - 1) and not files
+            connector = "└── " if is_last_dir else "├── "
+            lines.append(f"{prefix}{connector}{d.name}/")
+            extension = "    " if is_last_dir else "│   "
+            walk(d, prefix + extension, depth + 1)
+
+        for i, f in enumerate(files):
+            is_last = i == len(files) - 1
+            connector = "└── " if is_last else "├── "
+            size = f.stat().st_size
+            size_str = f" ({_human_size(size)})" if size > 1024 else ""
+            lines.append(f"{prefix}{connector}{f.name}{size_str}")
+
+    lines.append(f"{repo_path.name}/")
+    walk(repo_path, "", 1)
+    return "\n".join(lines)
+
+
+def _human_size(n: int) -> str:
+    for unit in ["B", "KB", "MB", "GB"]:
+        if n < 1024:
+            return f"{n:.0f}{unit}" if unit == "B" else f"{n:.1f}{unit}"
+        n /= 1024
+    return f"{n:.1f}TB"
+
+
+def collect_key_files(repo_root: str) -> dict:
+    """Read content of key files."""
+    repo_path = Path(repo_root).resolve()
+    contents = {}
+
+    for key_file in KEY_FILES:
+        path = repo_path / key_file
+        if path.is_file():
+            try:
+                content = path.read_text(errors="replace")
+                # Truncate very long files
+                if len(content) > 5000:
+                    content = content[:5000] + f"\n... [truncated, {len(content)} chars total]"
+                contents[key_file] = content
+            except Exception:
+                pass
+
+    # Also check for key files in common subdirs
+    for subdir in ["", "docs", "src", "lib"]:
+        for name in ["README.md", "ARCHITECTURE.md"]:
+            p = repo_path / subdir / name
+            if p.is_file() and f"{subdir}/{name}" not in contents:
+                try:
+                    content = p.read_text(errors="replace")[:5000]
+                    contents[f"{subdir}/{name}"] = content
+                except Exception:
+                    pass
+
+    return contents
+
+
+def git_stats(repo_root: str) -> dict:
+    """Get basic git statistics."""
+    stats = {}
+    try:
+        r = subprocess.run(["git", "log", "--oneline", "-20"], capture_output=True, text=True, cwd=repo_root, timeout=10)
+        if r.returncode == 0:
+            stats["recent_commits"] = r.stdout.strip()
+    except Exception:
+        pass
+
+    try:
+        r = subprocess.run(["git", "branch", "-a"], capture_output=True, text=True, cwd=repo_root, timeout=10)
+        if r.returncode == 0:
+            stats["branches"] = r.stdout.strip()[:2000]
+    except Exception:
+        pass
+
+    try:
+        r = subprocess.run(["git", "shortlog", "-sn", "--all"], capture_output=True, text=True, cwd=repo_root, timeout=10)
+        if r.returncode == 0:
+            stats["contributors"] = r.stdout.strip()[:1000]
+    except Exception:
+        pass
+
+    return stats
+
+
+def file_stats(repo_root: str) -> dict:
+    """Count files by extension."""
+    ext_counts = {}
+    total_lines = 0
+    repo_path = Path(repo_root).resolve()
+
+    for f in repo_path.rglob("*"):
+        if not f.is_file():
+            continue
+        if any(skip in f.parts for skip in SKIP_DIRS):
+            continue
+        if any(f.name.endswith(ext) for ext in SKIP_EXTENSIONS):
+            continue
+
+        ext = f.suffix or "(no ext)"
+        ext_counts[ext] = ext_counts.get(ext, 0) + 1
+
+        # Count lines for text files
+        if ext in {".py", ".js", ".ts", ".go", ".rs", ".java", ".rb", ".sh", ".md", ".yaml", ".yml", ".json", ".toml"}:
+            try:
+                with open(f, errors="replace") as fh:
+                    total_lines += sum(1 for _ in fh)
+            except Exception:
+                pass
+
+    return {
+        "file_counts": dict(sorted(ext_counts.items(), key=lambda x: -x[1])[:20]),
+        "total_files": sum(ext_counts.values()),
+        "total_code_lines": total_lines,
+    }
+
+
+def build_context_bundle(repo_root: str, repo_name: str) -> str:
+    """Build a markdown context bundle for LLM consumption."""
+    lines = []
+    lines.append(f"# Repo Audit Context: {repo_name}")
+    lines.append(f"\nGenerated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}")
+    lines.append(f"Path: {repo_root}")
+
+    # File stats
+    stats = file_stats(repo_root)
+    lines.append(f"\n## File Statistics")
+    lines.append(f"- Total files: {stats['total_files']}")
+    lines.append(f"- Total code lines: {stats['total_code_lines']}")
+    lines.append(f"\n### Files by Extension")
+    for ext, count in list(stats["file_counts"].items())[:15]:
+        lines.append(f"  {ext}: {count}")
+
+    # Directory tree
+    lines.append(f"\n## Directory Tree")
+    lines.append("```")
+    lines.append(generate_tree(repo_root))
+    lines.append("```")
+
+    # Key files
+    key_contents = collect_key_files(repo_root)
+    if key_contents:
+        lines.append(f"\n## Key Files")
+        for name, content in sorted(key_contents.items()):
+            lines.append(f"\n### {name}")
+            lines.append("```")
+            lines.append(content)
+            lines.append("```")
+
+    # Git stats
+    git = git_stats(repo_root)
+    if git.get("recent_commits"):
+        lines.append(f"\n## Recent Commits")
+        lines.append("```")
+        lines.append(git["recent_commits"])
+        lines.append("```")
+
+    if git.get("contributors"):
+        lines.append(f"\n## Contributors")
+        lines.append("```")
+        lines.append(git["contributors"])
+        lines.append("```")
+
+    return "\n".join(lines)
+
+
+# ── LLM Analysis ───────────────────────────────────────
+
+AUDIT_PROMPT = """You are a senior software architect conducting a deep audit of a codebase.
+
+Analyze the following repo context and produce a structured audit report covering:
+
+1. **Architecture Overview** — What is this project? What are its main components?
+2. **Code Quality** — Patterns, anti-patterns, consistency, test coverage indicators
+3. **Dependencies** — Key dependencies, potential risks, version pinning
+4. **Security** — Obvious security concerns, credential handling, input validation
+5. **Documentation** — Quality of docs, missing documentation
+6. **Technical Debt** — Areas needing refactoring, TODO/FIXME patterns
+7. **Recommendations** — Prioritized list of improvements
+
+## Context Bundle
+
+{CONTEXT}
+
+## Output
+
+Output a structured markdown audit report. Be specific — reference actual files and patterns found.
+Be honest about gaps. Do not hallucinate features that aren't in the context."""
+
+SUMMARY_PROMPT = """Based on this audit report, produce a one-paragraph executive summary and a list of the top 3 most critical issues that should be addressed first.
+
+Audit Report:
+{REPORT}"""
+
+
+def call_ollama(prompt: str, model: str = None, num_ctx: int = DEFAULT_CTX, timeout: int = 300) -> str:
+    model = model or DEFAULT_MODEL
+    payload = json.dumps({
+        "model": model,
+        "messages": [{"role": "user", "content": prompt}],
+        "stream": False,
+        "options": {"temperature": 0.2, "num_ctx": num_ctx},
+    }).encode()
+
+    url = f"{OLLAMA_URL.rstrip('/')}/api/chat"
+    req = urllib.request.Request(url, data=payload, headers={"Content-Type": "application/json"})
+    try:
+        resp = urllib.request.urlopen(req, timeout=timeout)
+        data = json.loads(resp.read())
+        return data.get("message", {}).get("content", "")
+    except Exception as e:
+        return f"ERROR: {e}"
+
+
+# ── CLI ─────────────────────────────────────────────────
+
+def main():
+    p = argparse.ArgumentParser(description="Big Brain Repo Audit")
+    p.add_argument("--repo-root", required=True, help="Path to repo")
+    p.add_argument("--repo-name", default="", help="Owner/repo for report header")
+    p.add_argument("--model", default=DEFAULT_MODEL, help="Ollama model")
+    p.add_argument("--ollama-url", default=OLLAMA_URL)
+    p.add_argument("--num-ctx", type=int, default=DEFAULT_CTX, help="Context window")
+    p.add_argument("--context-out", default="", help="Write context bundle to file")
+    p.add_argument("--report-out", default="", help="Write audit report to file")
+    p.add_argument("--context-only", action="store_true", help="Only generate context, no LLM")
+    args = p.parse_args()
+
+    repo_root = os.path.abspath(args.repo_root)
+    repo_name = args.repo_name or os.path.basename(repo_root)
+
+    if not os.path.isdir(repo_root):
+        print(f"ERROR: {repo_root} not found", file=sys.stderr)
+        sys.exit(1)
+
+    # Generate context
+    print(f"Generating context bundle for {repo_name}...")
+    context = build_context_bundle(repo_root, repo_name)
+    print(f"Context: {len(context)} chars")
+
+    # Write context
+    context_path = args.context_out
+    if context_path:
+        os.makedirs(os.path.dirname(context_path) or ".", exist_ok=True)
+        Path(context_path).write_text(context)
+        print(f"Context written to {context_path}")
+
+    if args.context_only:
+        return
+
+    # Run audit
+    global OLLAMA_URL
+    OLLAMA_URL = args.ollama_url
+
+    print(f"Running audit with {args.model} (ctx={args.num_ctx})...")
+    prompt = AUDIT_PROMPT.format(CONTEXT=context[:60000])  # Truncate if too long
+    report = call_ollama(prompt, model=args.model, num_ctx=args.num_ctx)
+    print(f"Report: {len(report)} chars")
+
+    # Write report
+    report_path = args.report_out
+    if report_path:
+        os.makedirs(os.path.dirname(report_path) or ".", exist_ok=True)
+        Path(report_path).write_text(report)
+        print(f"Report written to {report_path}")
+
+    # Print summary
+    print(f"\n{'='*60}")
+    print(f"  AUDIT COMPLETE: {repo_name}")
+    print(f"{'='*60}")
+    print(report[:2000])
+    if len(report) > 2000:
+        print(f"\n... [{len(report)} chars total, see {report_path or 'stdout above'}]")
+
+
+if __name__ == "__main__":
+    main()
--- a/tests/test_big_brain_repo_audit.py
+++ b/tests/test_big_brain_repo_audit.py
@@ -0,0 +1,101 @@
+"""Tests for scripts.big_brain_repo_audit."""
+
+import json
+import os
+import tempfile
+
+import pytest
+
+
+@pytest.fixture
+def sample_repo(tmp_path):
+    """Create a minimal fake repo for testing."""
+    repo = tmp_path / "test-repo"
+    repo.mkdir()
+
+    (repo / "README.md").write_text("# Test Repo\n\nA test project.")
+    (repo / "src").mkdir()
+    (repo / "src" / "main.py").write_text("def hello():\n    return 'world'\n")
+    (repo / "tests").mkdir()
+    (repo / "tests" / "test_main.py").write_text("def test_hello():\n    assert True\n")
+    (repo / ".git").mkdir()  # fake git dir
+    (repo / ".git" / "config").write_text("[core]\n")
+    (repo / "node_modules").mkdir()  # should be skipped
+    (repo / "node_modules" / "dep.js").write_text("// skipped")
+
+    return str(repo)
+
+
+class TestTreeGeneration:
+    def test_generate_tree(self, sample_repo):
+        from scripts.big_brain_repo_audit import generate_tree
+        tree = generate_tree(sample_repo)
+        assert "test-repo/" in tree
+        assert "README.md" in tree
+        assert "src/" in tree
+        assert "main.py" in tree
+        assert "node_modules" not in tree  # skipped
+
+    def test_generate_tree_skips_git(self, sample_repo):
+        from scripts.big_brain_repo_audit import generate_tree
+        tree = generate_tree(sample_repo)
+        assert ".git/" not in tree
+
+    def test_max_depth(self, sample_repo):
+        from scripts.big_brain_repo_audit import generate_tree
+        tree = generate_tree(sample_repo, max_depth=1)
+        # Should show top-level but not nested content
+        assert "test-repo/" in tree
+        assert "src/" in tree
+
+
+class TestKeyFiles:
+    def test_collects_readme(self, sample_repo):
+        from scripts.big_brain_repo_audit import collect_key_files
+        files = collect_key_files(sample_repo)
+        assert "README.md" in files
+        assert "Test Repo" in files["README.md"]
+
+    def test_missing_files_skipped(self, sample_repo):
+        from scripts.big_brain_repo_audit import collect_key_files
+        files = collect_key_files(sample_repo)
+        assert "Dockerfile" not in files  # doesn't exist
+
+
+class TestFileStats:
+    def test_counts_files(self, sample_repo):
+        from scripts.big_brain_repo_audit import file_stats
+        stats = file_stats(sample_repo)
+        assert stats["total_files"] >= 3
+        assert ".py" in stats["file_counts"]
+        assert stats["total_code_lines"] > 0
+
+
+class TestContextBundle:
+    def test_builds_bundle(self, sample_repo):
+        from scripts.big_brain_repo_audit import build_context_bundle
+        bundle = build_context_bundle(sample_repo, "Test/test-repo")
+        assert "Test/test-repo" in bundle
+        assert "Directory Tree" in bundle
+        assert "README.md" in bundle
+        assert "File Statistics" in bundle
+
+    def test_bundle_is_markdown(self, sample_repo):
+        from scripts.big_brain_repo_audit import build_context_bundle
+        bundle = build_context_bundle(sample_repo, "test/repo")
+        assert bundle.startswith("# ")
+        assert "```" in bundle
+
+
+class TestHumanSize:
+    def test_bytes(self):
+        from scripts.big_brain_repo_audit import _human_size
+        assert _human_size(500) == "500B"
+
+    def test_kb(self):
+        from scripts.big_brain_repo_audit import _human_size
+        assert _human_size(2048) == "2.0KB"
+
+    def test_mb(self):
+        from scripts.big_brain_repo_audit import _human_size
+        assert _human_size(5 * 1024 * 1024) == "5.0MB"