Files
timmy-home/scripts/big_brain_repo_audit.py
Alexander Whitestone 303ae44411
Some checks failed
Smoke Test / smoke (push) Failing after 11s
feat: add big-brain nexus audit artifact (#656)
Merge PR #656
2026-04-14 22:18:20 +00:00

281 lines
8.5 KiB
Python

#!/usr/bin/env python3
"""Build a Big Brain audit artifact for a repository via Ollama.
The script creates a markdown context bundle from a repo, prompts an Ollama model
for an architecture/security audit, and writes the final report to disk.
"""
from __future__ import annotations
import argparse
import json
import os
import urllib.request
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable
# Directory names pruned from the os.walk traversal (VCS metadata, virtualenvs,
# dependency trees, caches, and build output) — nothing here is worth auditing.
IGNORED_DIRS = {
    ".git",
    ".hg",
    ".svn",
    ".venv",
    "venv",
    "node_modules",
    "__pycache__",
    ".mypy_cache",
    ".pytest_cache",
    "dist",
    "build",
    "coverage",
}
# File suffixes treated as reviewable text; anything else is skipped unless it
# matches a well-known extension-less name (see _is_text_file).
TEXT_SUFFIXES = {
    ".py",
    ".js",
    ".mjs",
    ".cjs",
    ".ts",
    ".tsx",
    ".jsx",
    ".html",
    ".css",
    ".md",
    ".txt",
    ".json",
    ".yaml",
    ".yml",
    ".sh",
    ".ini",
    ".cfg",
    ".toml",
}
# Exact repo-relative paths that get the largest priority boost (+100) when
# choosing which excerpts make it into the bounded context bundle.
PRIORITY_FILENAMES = {
    "README.md",
    "CLAUDE.md",
    "POLICY.md",
    "DEVELOPMENT.md",
    "BROWSER_CONTRACT.md",
    "index.html",
    "app.js",
    "style.css",
    "server.py",
    "gofai_worker.js",
    "provenance.json",
    "tests/test_provenance.py",
}
# Substrings that mark a path as belonging to an interesting subtree (+25).
# NOTE(review): matched with `in`, so "tests/" also matches nested ".../tests/".
PRIORITY_SNIPPETS = (
    "tests/",
    "docs/",
    "nexus/",
    "intelligence/deepdive/",
    "scaffold/deepdive/",
    "bin/",
)
@dataclass(frozen=True)
class RepoFile:
path: str
abs_path: Path
size_bytes: int
line_count: int
def to_dict(self) -> dict[str, int | str]:
return {
"path": self.path,
"size_bytes": self.size_bytes,
"line_count": self.line_count,
}
def _is_text_file(path: Path) -> bool:
    """True when *path* looks like reviewable text, by suffix or well-known name."""
    if path.name in {"Dockerfile", "Makefile"}:
        return True
    return path.suffix.lower() in TEXT_SUFFIXES
def collect_repo_files(repo_root: str | Path) -> list[dict[str, int | str]]:
    """Walk *repo_root* and return a manifest of its text files, sorted by path.

    Each entry is ``{"path", "size_bytes", "line_count"}`` with the path
    repo-relative in POSIX form.  Directories in IGNORED_DIRS are pruned from
    the walk.  Files that cannot be read or stat'ed (permissions, broken
    symlinks, files deleted mid-walk) are skipped instead of aborting the
    entire scan, which the original read_text/stat calls would have done.
    """
    root = Path(repo_root).resolve()
    files: list[RepoFile] = []
    for current_root, dirnames, filenames in os.walk(root):
        # Prune in place (and sort for determinism) so os.walk never descends
        # into ignored directories.
        dirnames[:] = sorted(d for d in dirnames if d not in IGNORED_DIRS)
        base = Path(current_root)
        for filename in sorted(filenames):
            path = base / filename
            if not _is_text_file(path):
                continue
            try:
                text = path.read_text(errors="replace")
                size = path.stat().st_size
            except OSError:
                # Unreadable file: skip it rather than crash the whole audit.
                continue
            files.append(
                RepoFile(
                    path=path.relative_to(root).as_posix(),
                    abs_path=path,
                    size_bytes=size,
                    # An empty file still counts as one line in the manifest.
                    line_count=len(text.splitlines()) or 1,
                )
            )
    return [item.to_dict() for item in sorted(files, key=lambda item: item.path)]
def _priority_score(path: str) -> tuple[int, int, str]:
    """Sort key ranking higher-priority paths first, then shorter, then lexical."""
    weight = 0
    if path in PRIORITY_FILENAMES:
        weight += 100
    if any(marker in path for marker in PRIORITY_SNIPPETS):
        weight += 25
    if "/" not in path:
        weight += 20  # top-level files are usually entry points
    if path.startswith("tests/"):
        weight += 10
    if path.endswith("README.md"):
        weight += 10
    # Negated so that an ascending sorted() places the highest score first.
    return (-weight, len(path), path)
def _numbered_excerpt(path: Path, max_chars: int) -> str:
lines = path.read_text(errors="replace").splitlines()
rendered: list[str] = []
total = 0
for idx, line in enumerate(lines, start=1):
numbered = f"{idx}|{line}"
if rendered and total + len(numbered) + 1 > max_chars:
rendered.append("...[truncated]...")
break
rendered.append(numbered)
total += len(numbered) + 1
return "\n".join(rendered)
def render_context_bundle(
    repo_root: str | Path,
    repo_name: str,
    max_chars_per_file: int = 6000,
    max_total_chars: int = 120000,
) -> str:
    """Build the markdown context bundle: header, file manifest, file excerpts.

    Excerpts are appended in priority order (see _priority_score) until the
    bundle would exceed *max_total_chars*; each excerpt is itself capped at
    *max_chars_per_file* by _numbered_excerpt.

    Fix: the manifest line previously rendered as
    ``- {path}{line_count} lines, ...`` — the path ran straight into the line
    count with no separator (e.g. "app.js120 lines").  A " — " separator is
    now emitted between path and stats.
    """
    root = Path(repo_root).resolve()
    # Rehydrate RepoFile records (with abs_path) from the serialized manifest.
    files = [
        RepoFile(Path(item["path"]).as_posix(), root / str(item["path"]), int(item["size_bytes"]), int(item["line_count"]))
        for item in collect_repo_files(root)
    ]
    lines: list[str] = [
        f"# Audit Context Bundle — {repo_name}",
        "",
        f"Generated: {datetime.now(timezone.utc).isoformat()}",
        f"Repo root: {root}",
        f"Text files indexed: {len(files)}",
        "",
        "## File manifest",
    ]
    for item in files:
        # Separator fixed: path and stats are now visually distinct.
        lines.append(f"- {item.path} — {item.line_count} lines, {item.size_bytes} bytes")
    lines.extend(["", "## Selected file excerpts"])
    total_chars = len("\n".join(lines))
    for item in sorted(files, key=lambda f: _priority_score(f.path)):
        excerpt = _numbered_excerpt(item.abs_path, max_chars_per_file)
        # `block` mirrors exactly what the appended lines contribute once joined,
        # so the running total tracks the final bundle size.
        block = f"\n### {item.path}\n```text\n{excerpt}\n```\n"
        if total_chars + len(block) > max_total_chars:
            break
        lines.append(f"### {item.path}")
        lines.append("```text")
        lines.append(excerpt)
        lines.append("```")
        lines.append("")
        total_chars += len(block)
    return "\n".join(lines).rstrip() + "\n"
def build_audit_prompt(repo_name: str, context_bundle: str) -> str:
    """Assemble the single user-message audit prompt wrapped around *context_bundle*."""
    segments = [
        f"You are auditing the repository {repo_name}.\n\n",
        "Use only the supplied context bundle. Be concrete, skeptical, and reference file:line locations.\n\n",
        "Return markdown with these sections exactly:\n",
        "1. Architecture summary\n",
        "2. Top 5 structural issues\n",
        "3. Top 3 recommended refactors\n",
        "4. Security concerns\n",
        "5. Follow-on issue candidates\n\n",
        "Rules:\n",
        "- Every issue and refactor must cite at least one file:line reference.\n",
        "- Prefer contradictions, dead code, duplicate ownership, stale docs, brittle boundaries, and unsafe execution paths.\n",
        "- If docs and code disagree, say so plainly.\n",
        "- Keep it actionable for a Gitea issue/PR workflow.\n\n",
        "Context bundle:\n\n",
        context_bundle,
    ]
    return "".join(segments)
def call_ollama_chat(prompt: str, model: str, ollama_url: str, num_ctx: int = 32768, timeout: int = 600) -> str:
    """POST *prompt* to Ollama's /api/chat endpoint and return the reply text.

    Handles both the chat response shape ({"message": {"content": ...}}) and
    the generate shape ({"response": ...}).  Raises ValueError for anything else.
    """
    body = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
        "options": {"num_ctx": num_ctx},
    }
    endpoint = f"{ollama_url.rstrip('/')}/api/chat"
    req = urllib.request.Request(
        endpoint,
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req, timeout=timeout) as response:
        parsed = json.loads(response.read().decode())
    message = parsed.get("message")
    if isinstance(message, dict):
        return message.get("content", "")
    if "response" in parsed:
        return str(parsed["response"])
    raise ValueError(f"Unexpected Ollama response shape: {parsed}")
def generate_audit_report(
    repo_root: str | Path,
    repo_name: str,
    model: str,
    ollama_url: str,
    num_ctx: int,
    context_out: str | Path | None = None,
) -> tuple[str, str]:
    """Render the context bundle, query Ollama, and return (bundle, report).

    When *context_out* is given, the bundle is also written to that path
    (parent directories are created as needed) before the model call.
    """
    bundle = render_context_bundle(repo_root, repo_name=repo_name)
    if context_out:
        destination = Path(context_out)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(bundle)
    audit = call_ollama_chat(
        build_audit_prompt(repo_name, bundle),
        model=model,
        ollama_url=ollama_url,
        num_ctx=num_ctx,
    )
    return bundle, audit
def main() -> None:
    """CLI entry point: parse arguments, run the audit, write the report."""
    parser = argparse.ArgumentParser(description="Generate a Big Brain repo audit artifact via Ollama")
    parser.add_argument("--repo-root", required=True, help="Path to the repository to audit")
    parser.add_argument("--repo-name", required=True, help="Repository name, e.g. Timmy_Foundation/the-nexus")
    # Model/endpoint defaults can be overridden via environment variables.
    parser.add_argument("--model", default=os.environ.get("BIG_BRAIN_MODEL", "gemma4:latest"))
    parser.add_argument("--ollama-url", default=os.environ.get("OLLAMA_URL", "http://localhost:11434"))
    parser.add_argument("--num-ctx", type=int, default=int(os.environ.get("BIG_BRAIN_NUM_CTX", "32768")))
    parser.add_argument("--context-out", default=None, help="Optional path to save the generated context bundle")
    parser.add_argument("--report-out", required=True, help="Path to save the generated markdown audit")
    args = parser.parse_args()

    _, report = generate_audit_report(
        repo_root=args.repo_root,
        repo_name=args.repo_name,
        model=args.model,
        ollama_url=args.ollama_url,
        num_ctx=args.num_ctx,
        context_out=args.context_out,
    )

    destination = Path(args.report_out)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(report)
    print(f"Audit report saved to {destination}")


if __name__ == "__main__":
    main()