Compare commits
1 Commits
fix/675
...
am/656-177
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5a4d73db7a |
350
scripts/big_brain_repo_audit.py
Executable file
350
scripts/big_brain_repo_audit.py
Executable file
@@ -0,0 +1,350 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Big Brain Repo Audit — Ollama-driven deep codebase audit.
|
||||
|
||||
Generates a context bundle from a repo, sends it to a local LLM for
|
||||
analysis, and produces a structured audit report.
|
||||
|
||||
Usage:
|
||||
python3 big_brain_repo_audit.py --repo-root /path/to/repo --repo-name Owner/repo
|
||||
python3 big_brain_repo_audit.py --repo-root . --repo-name Timmy_Foundation/the-nexus --model gemma4:latest
|
||||
python3 big_brain_repo_audit.py --context-only # just generate context bundle, no LLM
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import urllib.request
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
# ── Config ──────────────────────────────────────────────
# All three defaults are overridable via environment variables.
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")  # Ollama server base URL
DEFAULT_MODEL = os.environ.get("AUDIT_MODEL", "gemma4:latest")       # model name passed to /api/chat
DEFAULT_CTX = int(os.environ.get("AUDIT_NUM_CTX", "32768"))          # num_ctx option for the LLM call

# Files/dirs to skip during tree generation
# Directory names excluded from both the tree walk and file statistics
# (VCS metadata, dependency caches, virtualenvs, build artifacts).
SKIP_DIRS = {
    ".git", "node_modules", "__pycache__", ".pytest_cache", ".mypy_cache",
    "venv", ".venv", "env", ".env", "dist", "build", ".tox", ".eggs",
    "vendor", "target", ".next", ".nuxt", "coverage", ".coverage",
}
# File-name suffixes treated as binary / non-reviewable and excluded
# everywhere (matched with str.endswith, so dotted names like
# "package-lock.json" are unaffected unless the suffix itself matches).
SKIP_EXTENSIONS = {
    ".pyc", ".pyo", ".so", ".dylib", ".o", ".a", ".class", ".jar",
    ".bin", ".exe", ".dll", ".wasm", ".lock",
    ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico", ".webp",
    ".mp3", ".mp4", ".wav", ".ogg", ".avi", ".mov",
    ".zip", ".tar", ".gz", ".bz2", ".7z", ".rar",
    ".db", ".sqlite", ".sqlite3",
    ".woff", ".woff2", ".ttf", ".eot",
}

# Key files to include full content
# Repo-root-relative paths whose (truncated) full text is embedded in the
# context bundle when present.
KEY_FILES = {
    "README.md", "README.rst", "README.txt",
    "CONTRIBUTING.md", "ARCHITECTURE.md", "DESIGN.md",
    "SOUL.md", "AGENTS.md",
    "Makefile", "Dockerfile", "docker-compose.yml", "docker-compose.yaml",
    "pyproject.toml", "setup.py", "setup.cfg",
    "package.json", "Cargo.toml", "go.mod",
    ".gitea/workflows/ci.yml",
}
|
||||
|
||||
|
||||
# ── Context Bundle Generation ──────────────────────────
|
||||
|
||||
def generate_tree(repo_root: str, max_depth: int = 5) -> str:
    """Generate a directory tree listing.

    Recursively walks the repo up to ``max_depth`` levels, skipping
    directories in SKIP_DIRS and files whose names end with a
    SKIP_EXTENSIONS suffix.  Files larger than 1 KiB are annotated with
    a human-readable size.

    Args:
        repo_root: Path to the repository root.
        max_depth: Maximum recursion depth; the root's children are depth 1.

    Returns:
        The tree as a single newline-joined string, rooted at the repo's
        directory name.
    """
    lines = []
    repo_path = Path(repo_root).resolve()

    def walk(path: Path, prefix: str, depth: int):
        if depth > max_depth:
            return
        try:
            # Directories first, then files, each alphabetically (case-insensitive).
            entries = sorted(path.iterdir(), key=lambda e: (not e.is_dir(), e.name.lower()))
        except PermissionError:
            # Unreadable directory: render nothing beneath it.
            return

        dirs = [e for e in entries if e.is_dir() and e.name not in SKIP_DIRS]
        files = [e for e in entries if e.is_file() and not any(e.name.endswith(ext) for ext in SKIP_EXTENSIONS)]

        for i, d in enumerate(dirs):
            # A directory is the visually "last" entry only if no files follow it.
            is_last_dir = (i == len(dirs) - 1) and not files
            connector = "└── " if is_last_dir else "├── "
            lines.append(f"{prefix}{connector}{d.name}/")
            extension = "    " if is_last_dir else "│   "
            walk(d, prefix + extension, depth + 1)

        for i, f in enumerate(files):
            is_last = i == len(files) - 1
            connector = "└── " if is_last else "├── "
            try:
                size = f.stat().st_size
            except OSError:
                # Broken symlink or file deleted mid-walk: fall back to no size
                # annotation instead of crashing the whole tree (stat() raises
                # OSError in both cases).
                size = 0
            size_str = f" ({_human_size(size)})" if size > 1024 else ""
            lines.append(f"{prefix}{connector}{f.name}{size_str}")

    lines.append(f"{repo_path.name}/")
    walk(repo_path, "", 1)
    return "\n".join(lines)
|
||||
|
||||
|
||||
def _human_size(n: int) -> str:
|
||||
for unit in ["B", "KB", "MB", "GB"]:
|
||||
if n < 1024:
|
||||
return f"{n:.0f}{unit}" if unit == "B" else f"{n:.1f}{unit}"
|
||||
n /= 1024
|
||||
return f"{n:.1f}TB"
|
||||
|
||||
|
||||
def collect_key_files(repo_root: str) -> dict:
    """Read content of key files.

    Collects every file listed in KEY_FILES (relative to the repo root),
    then additionally looks for README.md / ARCHITECTURE.md in a few
    common subdirectories.

    Args:
        repo_root: Path to the repository root.

    Returns:
        Mapping of repo-relative path -> file content (truncated to
        5000 characters).
    """
    repo_path = Path(repo_root).resolve()
    contents = {}

    for key_file in KEY_FILES:
        path = repo_path / key_file
        if path.is_file():
            try:
                content = path.read_text(errors="replace")
                # Truncate very long files (the marker records the original length).
                if len(content) > 5000:
                    content = content[:5000] + f"\n... [truncated, {len(content)} chars total]"
                contents[key_file] = content
            except Exception:
                # Best-effort collection: unreadable files are simply omitted.
                pass

    # Also check for key files in common subdirs
    for subdir in ["", "docs", "src", "lib"]:
        for name in ["README.md", "ARCHITECTURE.md"]:
            p = repo_path / subdir / name
            # Use the repo-relative path as the key.  For the empty subdir the
            # key must be the bare name: the original f"{subdir}/{name}" built
            # "/README.md", which never matched the "README.md" already in
            # `contents`, so the root README was re-read and stored twice.
            key = f"{subdir}/{name}" if subdir else name
            if p.is_file() and key not in contents:
                try:
                    contents[key] = p.read_text(errors="replace")[:5000]
                except Exception:
                    pass

    return contents
|
||||
|
||||
|
||||
def _git(repo_root: str, *args):
    """Run one git subcommand in *repo_root*; return stripped stdout, or None on any failure.

    Failures covered: git missing, timeout (10s), bad cwd, or a non-zero exit.
    """
    try:
        r = subprocess.run(["git", *args], capture_output=True, text=True, cwd=repo_root, timeout=10)
    except Exception:
        return None
    return r.stdout.strip() if r.returncode == 0 else None


def git_stats(repo_root: str) -> dict:
    """Get basic git statistics.

    Args:
        repo_root: Path to the repository root.

    Returns:
        Dict with any of the keys "recent_commits", "branches",
        "contributors" that could be gathered; a missing git binary or a
        non-repo directory simply yields fewer keys.
    """
    stats = {}

    # The three previous copy-pasted try/subprocess/returncode blocks are
    # collapsed into _git(); output limits match the originals.
    commits = _git(repo_root, "log", "--oneline", "-20")
    if commits is not None:
        stats["recent_commits"] = commits

    branches = _git(repo_root, "branch", "-a")
    if branches is not None:
        stats["branches"] = branches[:2000]

    contributors = _git(repo_root, "shortlog", "-sn", "--all")
    if contributors is not None:
        stats["contributors"] = contributors[:1000]

    return stats
|
||||
|
||||
|
||||
def file_stats(repo_root: str) -> dict:
    """Count files by extension and total code lines under *repo_root*.

    Skips SKIP_DIRS directories and SKIP_EXTENSIONS file suffixes.
    Returns a dict with "file_counts" (top 20 extensions by count),
    "total_files", and "total_code_lines".
    """
    counts_by_ext = {}
    line_total = 0
    root = Path(repo_root).resolve()
    # Only these extensions contribute to the line count.
    countable = {".py", ".js", ".ts", ".go", ".rs", ".java", ".rb", ".sh",
                 ".md", ".yaml", ".yml", ".json", ".toml"}

    for entry in root.rglob("*"):
        if not entry.is_file():
            continue
        if any(skipped in entry.parts for skipped in SKIP_DIRS):
            continue
        filename = entry.name
        if any(filename.endswith(suffix) for suffix in SKIP_EXTENSIONS):
            continue

        suffix = entry.suffix if entry.suffix else "(no ext)"
        counts_by_ext[suffix] = counts_by_ext.get(suffix, 0) + 1

        # Count lines for recognized text files; unreadable ones are skipped.
        if suffix in countable:
            try:
                with open(entry, errors="replace") as handle:
                    line_total += sum(1 for _ in handle)
            except Exception:
                pass

    ranked = sorted(counts_by_ext.items(), key=lambda item: -item[1])
    return {
        "file_counts": dict(ranked[:20]),
        "total_files": sum(counts_by_ext.values()),
        "total_code_lines": line_total,
    }
|
||||
|
||||
|
||||
def build_context_bundle(repo_root: str, repo_name: str) -> str:
    """Build a markdown context bundle for LLM consumption.

    Assembles file statistics, a directory tree, key-file contents and
    git history (via file_stats / generate_tree / collect_key_files /
    git_stats) into one markdown document.

    Args:
        repo_root: Path to the repository root.
        repo_name: Display name used in the bundle header.

    Returns:
        The bundle as a single markdown string.
    """
    lines = []
    lines.append(f"# Repo Audit Context: {repo_name}")
    lines.append(f"\nGenerated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}")
    lines.append(f"Path: {repo_root}")

    # File stats
    stats = file_stats(repo_root)
    # Placeholder-free strings are plain literals (the originals carried
    # extraneous f-prefixes, ruff F541); text is unchanged.
    lines.append("\n## File Statistics")
    lines.append(f"- Total files: {stats['total_files']}")
    lines.append(f"- Total code lines: {stats['total_code_lines']}")
    lines.append("\n### Files by Extension")
    for ext, count in list(stats["file_counts"].items())[:15]:
        lines.append(f" {ext}: {count}")

    # Directory tree
    lines.append("\n## Directory Tree")
    lines.append("```")
    lines.append(generate_tree(repo_root))
    lines.append("```")

    # Key files
    key_contents = collect_key_files(repo_root)
    if key_contents:
        lines.append("\n## Key Files")
        for name, content in sorted(key_contents.items()):
            lines.append(f"\n### {name}")
            lines.append("```")
            lines.append(content)
            lines.append("```")

    # Git stats
    git = git_stats(repo_root)
    if git.get("recent_commits"):
        lines.append("\n## Recent Commits")
        lines.append("```")
        lines.append(git["recent_commits"])
        lines.append("```")

    if git.get("contributors"):
        lines.append("\n## Contributors")
        lines.append("```")
        lines.append(git["contributors"])
        lines.append("```")

    return "\n".join(lines)
|
||||
|
||||
|
||||
# ── LLM Analysis ───────────────────────────────────────
|
||||
|
||||
# Prompt for the main audit pass; {CONTEXT} is replaced with the
# (possibly truncated) context bundle before sending to the LLM.
AUDIT_PROMPT = """You are a senior software architect conducting a deep audit of a codebase.

Analyze the following repo context and produce a structured audit report covering:

1. **Architecture Overview** — What is this project? What are its main components?
2. **Code Quality** — Patterns, anti-patterns, consistency, test coverage indicators
3. **Dependencies** — Key dependencies, potential risks, version pinning
4. **Security** — Obvious security concerns, credential handling, input validation
5. **Documentation** — Quality of docs, missing documentation
6. **Technical Debt** — Areas needing refactoring, TODO/FIXME patterns
7. **Recommendations** — Prioritized list of improvements

## Context Bundle

{CONTEXT}

## Output

Output a structured markdown audit report. Be specific — reference actual files and patterns found.
Be honest about gaps. Do not hallucinate features that aren't in the context."""

# Follow-up prompt to condense a full report; {REPORT} is the audit text.
# NOTE(review): not referenced by main() in this file — presumably reserved
# for a future summary pass; confirm before removing.
SUMMARY_PROMPT = """Based on this audit report, produce a one-paragraph executive summary and a list of the top 3 most critical issues that should be addressed first.

Audit Report:
{REPORT}"""
|
||||
|
||||
|
||||
def call_ollama(prompt: str, model: str = None, num_ctx: int = DEFAULT_CTX, timeout: int = 300) -> str:
    """Send a single user message to the local Ollama /api/chat endpoint.

    Args:
        prompt: The user-role message content.
        model: Model name; falls back to DEFAULT_MODEL when None/empty.
        num_ctx: Context-window size passed in the request options.
        timeout: Socket timeout in seconds.

    Returns:
        The model's reply text, or an "ERROR: ..." string on any failure
        (the audit flow is best-effort, so errors are surfaced in-band
        rather than raised).
    """
    model = model or DEFAULT_MODEL
    payload = json.dumps({
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
        "options": {"temperature": 0.2, "num_ctx": num_ctx},
    }).encode()

    url = f"{OLLAMA_URL.rstrip('/')}/api/chat"
    req = urllib.request.Request(url, data=payload, headers={"Content-Type": "application/json"})
    try:
        # Context manager guarantees the HTTP response is closed even if
        # reading or JSON decoding raises (the original leaked it).
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            data = json.loads(resp.read())
        return data.get("message", {}).get("content", "")
    except Exception as e:
        return f"ERROR: {e}"
|
||||
|
||||
|
||||
# ── CLI ─────────────────────────────────────────────────
|
||||
|
||||
def main():
    """CLI entry point: build the context bundle, then optionally run the LLM audit.

    Exits with status 1 if --repo-root does not exist.
    """
    # Declared up front: OLLAMA_URL is read below as the argparse default and
    # reassigned later.  Python rejects a function that uses a name before its
    # `global` statement ("name 'OLLAMA_URL' is used prior to global
    # declaration" is a SyntaxError), so the original placement of this line
    # just before the reassignment made the whole module fail to compile.
    global OLLAMA_URL

    p = argparse.ArgumentParser(description="Big Brain Repo Audit")
    p.add_argument("--repo-root", required=True, help="Path to repo")
    p.add_argument("--repo-name", default="", help="Owner/repo for report header")
    p.add_argument("--model", default=DEFAULT_MODEL, help="Ollama model")
    p.add_argument("--ollama-url", default=OLLAMA_URL)
    p.add_argument("--num-ctx", type=int, default=DEFAULT_CTX, help="Context window")
    p.add_argument("--context-out", default="", help="Write context bundle to file")
    p.add_argument("--report-out", default="", help="Write audit report to file")
    p.add_argument("--context-only", action="store_true", help="Only generate context, no LLM")
    args = p.parse_args()

    repo_root = os.path.abspath(args.repo_root)
    repo_name = args.repo_name or os.path.basename(repo_root)

    if not os.path.isdir(repo_root):
        print(f"ERROR: {repo_root} not found", file=sys.stderr)
        sys.exit(1)

    # Generate context
    print(f"Generating context bundle for {repo_name}...")
    context = build_context_bundle(repo_root, repo_name)
    print(f"Context: {len(context)} chars")

    # Write context
    context_path = args.context_out
    if context_path:
        os.makedirs(os.path.dirname(context_path) or ".", exist_ok=True)
        Path(context_path).write_text(context)
        print(f"Context written to {context_path}")

    if args.context_only:
        return

    # Run audit — repoint the module-level URL so call_ollama targets the
    # server requested on the command line.
    OLLAMA_URL = args.ollama_url

    print(f"Running audit with {args.model} (ctx={args.num_ctx})...")
    prompt = AUDIT_PROMPT.format(CONTEXT=context[:60000])  # Truncate if too long
    report = call_ollama(prompt, model=args.model, num_ctx=args.num_ctx)
    print(f"Report: {len(report)} chars")

    # Write report
    report_path = args.report_out
    if report_path:
        os.makedirs(os.path.dirname(report_path) or ".", exist_ok=True)
        Path(report_path).write_text(report)
        print(f"Report written to {report_path}")

    # Print summary
    print(f"\n{'='*60}")
    print(f" AUDIT COMPLETE: {repo_name}")
    print(f"{'='*60}")
    print(report[:2000])
    if len(report) > 2000:
        print(f"\n... [{len(report)} chars total, see {report_path or 'stdout above'}]")
|
||||
|
||||
|
||||
# Script entry point: only run the CLI when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
101
tests/test_big_brain_repo_audit.py
Normal file
101
tests/test_big_brain_repo_audit.py
Normal file
@@ -0,0 +1,101 @@
|
||||
"""Tests for scripts.big_brain_repo_audit."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
def sample_repo(tmp_path):
    """Create a minimal fake repo for testing."""
    root = tmp_path / "test-repo"
    root.mkdir()

    # Relative path -> content; parents are created on demand.
    layout = {
        "README.md": "# Test Repo\n\nA test project.",
        "src/main.py": "def hello():\n return 'world'\n",
        "tests/test_main.py": "def test_hello():\n assert True\n",
        ".git/config": "[core]\n",            # fake git dir
        "node_modules/dep.js": "// skipped",  # should be skipped
    }
    for relative, text in layout.items():
        target = root / relative
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(text)

    return str(root)
|
||||
|
||||
|
||||
class TestTreeGeneration:
    """generate_tree: rendering, skip rules, and depth limiting."""

    def test_generate_tree(self, sample_repo):
        from scripts.big_brain_repo_audit import generate_tree

        rendered = generate_tree(sample_repo)
        for fragment in ("test-repo/", "README.md", "src/", "main.py"):
            assert fragment in rendered
        assert "node_modules" not in rendered  # skipped

    def test_generate_tree_skips_git(self, sample_repo):
        from scripts.big_brain_repo_audit import generate_tree

        assert ".git/" not in generate_tree(sample_repo)

    def test_max_depth(self, sample_repo):
        from scripts.big_brain_repo_audit import generate_tree

        shallow = generate_tree(sample_repo, max_depth=1)
        # Top-level entries survive even at depth 1.
        assert "test-repo/" in shallow
        assert "src/" in shallow
|
||||
|
||||
|
||||
class TestKeyFiles:
    """collect_key_files: present files collected, absent ones omitted."""

    def test_collects_readme(self, sample_repo):
        from scripts.big_brain_repo_audit import collect_key_files

        collected = collect_key_files(sample_repo)
        assert "README.md" in collected
        assert "Test Repo" in collected["README.md"]

    def test_missing_files_skipped(self, sample_repo):
        from scripts.big_brain_repo_audit import collect_key_files

        assert "Dockerfile" not in collect_key_files(sample_repo)  # doesn't exist
|
||||
|
||||
|
||||
class TestFileStats:
    """file_stats: extension counts and line totals on the sample repo."""

    def test_counts_files(self, sample_repo):
        from scripts.big_brain_repo_audit import file_stats

        result = file_stats(sample_repo)
        assert result["total_files"] >= 3
        assert ".py" in result["file_counts"]
        assert result["total_code_lines"] > 0
|
||||
|
||||
|
||||
class TestContextBundle:
    """build_context_bundle: section presence and markdown shape."""

    def test_builds_bundle(self, sample_repo):
        from scripts.big_brain_repo_audit import build_context_bundle

        text = build_context_bundle(sample_repo, "Test/test-repo")
        for expected in ("Test/test-repo", "Directory Tree", "README.md", "File Statistics"):
            assert expected in text

    def test_bundle_is_markdown(self, sample_repo):
        from scripts.big_brain_repo_audit import build_context_bundle

        text = build_context_bundle(sample_repo, "test/repo")
        assert text.startswith("# ")
        assert "```" in text
|
||||
|
||||
|
||||
class TestHumanSize:
    """_human_size: one case per unit tier."""

    def test_bytes(self):
        from scripts.big_brain_repo_audit import _human_size

        assert _human_size(500) == "500B"

    def test_kb(self):
        from scripts.big_brain_repo_audit import _human_size

        assert _human_size(2 * 1024) == "2.0KB"

    def test_mb(self):
        from scripts.big_brain_repo_audit import _human_size

        assert _human_size(5 * 1024 * 1024) == "5.0MB"
|
||||
Reference in New Issue
Block a user