Compare commits

...

1 Commits

Author SHA1 Message Date
Timmy (AI Agent)
5a4d73db7a feat: big-brain repo audit script (#656)
Some checks failed
Smoke Test / smoke (pull_request) Failing after 26s
Reusable Ollama-driven repo audit tool that:
- Generates context bundle (directory tree, file stats, key file contents, git stats)
- Sends to local LLM for deep architectural analysis
- Produces structured audit report (architecture, quality, security, debt)
- Fully local — zero cloud credits

Usage:
    python3 scripts/big_brain_repo_audit.py         --repo-root /path/to/repo         --repo-name Timmy_Foundation/the-nexus         --model gemma4:latest         --context-out research/big-brain/context-bundle.md         --report-out research/big-brain/audit-report.md

Includes 12 tests covering tree generation, key file collection,
file stats, context bundle building, and helpers.

Resolves #575
Closes #656
2026-04-14 07:45:07 -04:00
2 changed files with 451 additions and 0 deletions

350
scripts/big_brain_repo_audit.py Executable file
View File

@@ -0,0 +1,350 @@
#!/usr/bin/env python3
"""
Big Brain Repo Audit — Ollama-driven deep codebase audit.
Generates a context bundle from a repo, sends it to a local LLM for
analysis, and produces a structured audit report.
Usage:
python3 big_brain_repo_audit.py --repo-root /path/to/repo --repo-name Owner/repo
python3 big_brain_repo_audit.py --repo-root . --repo-name Timmy_Foundation/the-nexus --model gemma4:latest
python3 big_brain_repo_audit.py --context-only # just generate context bundle, no LLM
"""
import argparse
import json
import os
import subprocess
import sys
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
# ── Config ──────────────────────────────────────────────
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
DEFAULT_MODEL = os.environ.get("AUDIT_MODEL", "gemma4:latest")
DEFAULT_CTX = int(os.environ.get("AUDIT_NUM_CTX", "32768"))
# Files/dirs to skip during tree generation
SKIP_DIRS = {
".git", "node_modules", "__pycache__", ".pytest_cache", ".mypy_cache",
"venv", ".venv", "env", ".env", "dist", "build", ".tox", ".eggs",
"vendor", "target", ".next", ".nuxt", "coverage", ".coverage",
}
SKIP_EXTENSIONS = {
".pyc", ".pyo", ".so", ".dylib", ".o", ".a", ".class", ".jar",
".bin", ".exe", ".dll", ".wasm", ".lock",
".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico", ".webp",
".mp3", ".mp4", ".wav", ".ogg", ".avi", ".mov",
".zip", ".tar", ".gz", ".bz2", ".7z", ".rar",
".db", ".sqlite", ".sqlite3",
".woff", ".woff2", ".ttf", ".eot",
}
# Key files to include full content
KEY_FILES = {
"README.md", "README.rst", "README.txt",
"CONTRIBUTING.md", "ARCHITECTURE.md", "DESIGN.md",
"SOUL.md", "AGENTS.md",
"Makefile", "Dockerfile", "docker-compose.yml", "docker-compose.yaml",
"pyproject.toml", "setup.py", "setup.cfg",
"package.json", "Cargo.toml", "go.mod",
".gitea/workflows/ci.yml",
}
# ── Context Bundle Generation ──────────────────────────
def generate_tree(repo_root: str, max_depth: int = 5) -> str:
"""Generate a directory tree listing."""
lines = []
repo_path = Path(repo_root).resolve()
def walk(path: Path, prefix: str, depth: int):
if depth > max_depth:
return
try:
entries = sorted(path.iterdir(), key=lambda e: (not e.is_dir(), e.name.lower()))
except PermissionError:
return
dirs = [e for e in entries if e.is_dir() and e.name not in SKIP_DIRS]
files = [e for e in entries if e.is_file() and not any(e.name.endswith(ext) for ext in SKIP_EXTENSIONS)]
for i, d in enumerate(dirs):
is_last_dir = (i == len(dirs) - 1) and not files
connector = "└── " if is_last_dir else "├── "
lines.append(f"{prefix}{connector}{d.name}/")
extension = " " if is_last_dir else ""
walk(d, prefix + extension, depth + 1)
for i, f in enumerate(files):
is_last = i == len(files) - 1
connector = "└── " if is_last else "├── "
size = f.stat().st_size
size_str = f" ({_human_size(size)})" if size > 1024 else ""
lines.append(f"{prefix}{connector}{f.name}{size_str}")
lines.append(f"{repo_path.name}/")
walk(repo_path, "", 1)
return "\n".join(lines)
def _human_size(n: int) -> str:
for unit in ["B", "KB", "MB", "GB"]:
if n < 1024:
return f"{n:.0f}{unit}" if unit == "B" else f"{n:.1f}{unit}"
n /= 1024
return f"{n:.1f}TB"
def collect_key_files(repo_root: str) -> dict:
"""Read content of key files."""
repo_path = Path(repo_root).resolve()
contents = {}
for key_file in KEY_FILES:
path = repo_path / key_file
if path.is_file():
try:
content = path.read_text(errors="replace")
# Truncate very long files
if len(content) > 5000:
content = content[:5000] + f"\n... [truncated, {len(content)} chars total]"
contents[key_file] = content
except Exception:
pass
# Also check for key files in common subdirs
for subdir in ["", "docs", "src", "lib"]:
for name in ["README.md", "ARCHITECTURE.md"]:
p = repo_path / subdir / name
if p.is_file() and f"{subdir}/{name}" not in contents:
try:
content = p.read_text(errors="replace")[:5000]
contents[f"{subdir}/{name}"] = content
except Exception:
pass
return contents
def git_stats(repo_root: str) -> dict:
"""Get basic git statistics."""
stats = {}
try:
r = subprocess.run(["git", "log", "--oneline", "-20"], capture_output=True, text=True, cwd=repo_root, timeout=10)
if r.returncode == 0:
stats["recent_commits"] = r.stdout.strip()
except Exception:
pass
try:
r = subprocess.run(["git", "branch", "-a"], capture_output=True, text=True, cwd=repo_root, timeout=10)
if r.returncode == 0:
stats["branches"] = r.stdout.strip()[:2000]
except Exception:
pass
try:
r = subprocess.run(["git", "shortlog", "-sn", "--all"], capture_output=True, text=True, cwd=repo_root, timeout=10)
if r.returncode == 0:
stats["contributors"] = r.stdout.strip()[:1000]
except Exception:
pass
return stats
def file_stats(repo_root: str) -> dict:
"""Count files by extension."""
ext_counts = {}
total_lines = 0
repo_path = Path(repo_root).resolve()
for f in repo_path.rglob("*"):
if not f.is_file():
continue
if any(skip in f.parts for skip in SKIP_DIRS):
continue
if any(f.name.endswith(ext) for ext in SKIP_EXTENSIONS):
continue
ext = f.suffix or "(no ext)"
ext_counts[ext] = ext_counts.get(ext, 0) + 1
# Count lines for text files
if ext in {".py", ".js", ".ts", ".go", ".rs", ".java", ".rb", ".sh", ".md", ".yaml", ".yml", ".json", ".toml"}:
try:
with open(f, errors="replace") as fh:
total_lines += sum(1 for _ in fh)
except Exception:
pass
return {
"file_counts": dict(sorted(ext_counts.items(), key=lambda x: -x[1])[:20]),
"total_files": sum(ext_counts.values()),
"total_code_lines": total_lines,
}
def build_context_bundle(repo_root: str, repo_name: str) -> str:
"""Build a markdown context bundle for LLM consumption."""
lines = []
lines.append(f"# Repo Audit Context: {repo_name}")
lines.append(f"\nGenerated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}")
lines.append(f"Path: {repo_root}")
# File stats
stats = file_stats(repo_root)
lines.append(f"\n## File Statistics")
lines.append(f"- Total files: {stats['total_files']}")
lines.append(f"- Total code lines: {stats['total_code_lines']}")
lines.append(f"\n### Files by Extension")
for ext, count in list(stats["file_counts"].items())[:15]:
lines.append(f" {ext}: {count}")
# Directory tree
lines.append(f"\n## Directory Tree")
lines.append("```")
lines.append(generate_tree(repo_root))
lines.append("```")
# Key files
key_contents = collect_key_files(repo_root)
if key_contents:
lines.append(f"\n## Key Files")
for name, content in sorted(key_contents.items()):
lines.append(f"\n### {name}")
lines.append("```")
lines.append(content)
lines.append("```")
# Git stats
git = git_stats(repo_root)
if git.get("recent_commits"):
lines.append(f"\n## Recent Commits")
lines.append("```")
lines.append(git["recent_commits"])
lines.append("```")
if git.get("contributors"):
lines.append(f"\n## Contributors")
lines.append("```")
lines.append(git["contributors"])
lines.append("```")
return "\n".join(lines)
# ── LLM Analysis ───────────────────────────────────────
AUDIT_PROMPT = """You are a senior software architect conducting a deep audit of a codebase.
Analyze the following repo context and produce a structured audit report covering:
1. **Architecture Overview** — What is this project? What are its main components?
2. **Code Quality** — Patterns, anti-patterns, consistency, test coverage indicators
3. **Dependencies** — Key dependencies, potential risks, version pinning
4. **Security** — Obvious security concerns, credential handling, input validation
5. **Documentation** — Quality of docs, missing documentation
6. **Technical Debt** — Areas needing refactoring, TODO/FIXME patterns
7. **Recommendations** — Prioritized list of improvements
## Context Bundle
{CONTEXT}
## Output
Output a structured markdown audit report. Be specific — reference actual files and patterns found.
Be honest about gaps. Do not hallucinate features that aren't in the context."""
SUMMARY_PROMPT = """Based on this audit report, produce a one-paragraph executive summary and a list of the top 3 most critical issues that should be addressed first.
Audit Report:
{REPORT}"""
def call_ollama(prompt: str, model: str = None, num_ctx: int = DEFAULT_CTX, timeout: int = 300) -> str:
model = model or DEFAULT_MODEL
payload = json.dumps({
"model": model,
"messages": [{"role": "user", "content": prompt}],
"stream": False,
"options": {"temperature": 0.2, "num_ctx": num_ctx},
}).encode()
url = f"{OLLAMA_URL.rstrip('/')}/api/chat"
req = urllib.request.Request(url, data=payload, headers={"Content-Type": "application/json"})
try:
resp = urllib.request.urlopen(req, timeout=timeout)
data = json.loads(resp.read())
return data.get("message", {}).get("content", "")
except Exception as e:
return f"ERROR: {e}"
# ── CLI ─────────────────────────────────────────────────
def main():
p = argparse.ArgumentParser(description="Big Brain Repo Audit")
p.add_argument("--repo-root", required=True, help="Path to repo")
p.add_argument("--repo-name", default="", help="Owner/repo for report header")
p.add_argument("--model", default=DEFAULT_MODEL, help="Ollama model")
p.add_argument("--ollama-url", default=OLLAMA_URL)
p.add_argument("--num-ctx", type=int, default=DEFAULT_CTX, help="Context window")
p.add_argument("--context-out", default="", help="Write context bundle to file")
p.add_argument("--report-out", default="", help="Write audit report to file")
p.add_argument("--context-only", action="store_true", help="Only generate context, no LLM")
args = p.parse_args()
repo_root = os.path.abspath(args.repo_root)
repo_name = args.repo_name or os.path.basename(repo_root)
if not os.path.isdir(repo_root):
print(f"ERROR: {repo_root} not found", file=sys.stderr)
sys.exit(1)
# Generate context
print(f"Generating context bundle for {repo_name}...")
context = build_context_bundle(repo_root, repo_name)
print(f"Context: {len(context)} chars")
# Write context
context_path = args.context_out
if context_path:
os.makedirs(os.path.dirname(context_path) or ".", exist_ok=True)
Path(context_path).write_text(context)
print(f"Context written to {context_path}")
if args.context_only:
return
# Run audit
global OLLAMA_URL
OLLAMA_URL = args.ollama_url
print(f"Running audit with {args.model} (ctx={args.num_ctx})...")
prompt = AUDIT_PROMPT.format(CONTEXT=context[:60000]) # Truncate if too long
report = call_ollama(prompt, model=args.model, num_ctx=args.num_ctx)
print(f"Report: {len(report)} chars")
# Write report
report_path = args.report_out
if report_path:
os.makedirs(os.path.dirname(report_path) or ".", exist_ok=True)
Path(report_path).write_text(report)
print(f"Report written to {report_path}")
# Print summary
print(f"\n{'='*60}")
print(f" AUDIT COMPLETE: {repo_name}")
print(f"{'='*60}")
print(report[:2000])
if len(report) > 2000:
print(f"\n... [{len(report)} chars total, see {report_path or 'stdout above'}]")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,101 @@
"""Tests for scripts.big_brain_repo_audit."""
import json
import os
import tempfile
import pytest
@pytest.fixture
def sample_repo(tmp_path):
"""Create a minimal fake repo for testing."""
repo = tmp_path / "test-repo"
repo.mkdir()
(repo / "README.md").write_text("# Test Repo\n\nA test project.")
(repo / "src").mkdir()
(repo / "src" / "main.py").write_text("def hello():\n return 'world'\n")
(repo / "tests").mkdir()
(repo / "tests" / "test_main.py").write_text("def test_hello():\n assert True\n")
(repo / ".git").mkdir() # fake git dir
(repo / ".git" / "config").write_text("[core]\n")
(repo / "node_modules").mkdir() # should be skipped
(repo / "node_modules" / "dep.js").write_text("// skipped")
return str(repo)
class TestTreeGeneration:
def test_generate_tree(self, sample_repo):
from scripts.big_brain_repo_audit import generate_tree
tree = generate_tree(sample_repo)
assert "test-repo/" in tree
assert "README.md" in tree
assert "src/" in tree
assert "main.py" in tree
assert "node_modules" not in tree # skipped
def test_generate_tree_skips_git(self, sample_repo):
from scripts.big_brain_repo_audit import generate_tree
tree = generate_tree(sample_repo)
assert ".git/" not in tree
def test_max_depth(self, sample_repo):
from scripts.big_brain_repo_audit import generate_tree
tree = generate_tree(sample_repo, max_depth=1)
# Should show top-level but not nested content
assert "test-repo/" in tree
assert "src/" in tree
class TestKeyFiles:
def test_collects_readme(self, sample_repo):
from scripts.big_brain_repo_audit import collect_key_files
files = collect_key_files(sample_repo)
assert "README.md" in files
assert "Test Repo" in files["README.md"]
def test_missing_files_skipped(self, sample_repo):
from scripts.big_brain_repo_audit import collect_key_files
files = collect_key_files(sample_repo)
assert "Dockerfile" not in files # doesn't exist
class TestFileStats:
def test_counts_files(self, sample_repo):
from scripts.big_brain_repo_audit import file_stats
stats = file_stats(sample_repo)
assert stats["total_files"] >= 3
assert ".py" in stats["file_counts"]
assert stats["total_code_lines"] > 0
class TestContextBundle:
def test_builds_bundle(self, sample_repo):
from scripts.big_brain_repo_audit import build_context_bundle
bundle = build_context_bundle(sample_repo, "Test/test-repo")
assert "Test/test-repo" in bundle
assert "Directory Tree" in bundle
assert "README.md" in bundle
assert "File Statistics" in bundle
def test_bundle_is_markdown(self, sample_repo):
from scripts.big_brain_repo_audit import build_context_bundle
bundle = build_context_bundle(sample_repo, "test/repo")
assert bundle.startswith("# ")
assert "```" in bundle
class TestHumanSize:
def test_bytes(self):
from scripts.big_brain_repo_audit import _human_size
assert _human_size(500) == "500B"
def test_kb(self):
from scripts.big_brain_repo_audit import _human_size
assert _human_size(2048) == "2.0KB"
def test_mb(self):
from scripts.big_brain_repo_audit import _human_size
assert _human_size(5 * 1024 * 1024) == "5.0MB"